-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us     668.713ms       100.06%     668.713ms     222.904ms           0 B           0 B           0 B           0 B             3                                          []  
torchfx::prefix_scan_phase3(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     310.214ms        46.42%     310.214ms      11.489ms           0 B           0 B           0 B           0 B            27                                          []  
torchfx::prefix_scan_phase1(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     281.131ms        42.06%     281.131ms      10.412ms           0 B           0 B           0 B           0 B            27                                          []  
torchfx::forcing_kernel(double const*, double const*...         0.00%       0.000us         0.00%       0.000us       0.000us      27.358ms         4.09%      27.358ms       1.013ms           0 B           0 B           0 B           0 B            27                                          []  
                                          ProfilerStep*         0.63%       4.243ms       100.00%     669.309ms     223.103ms       0.000us         0.00%      26.589ms       8.863ms           0 B         -72 B           0 B     -17.93 GB             3                                          []  
                                               aten::to         0.01%      57.429us         0.14%     924.395us      30.813us       0.000us         0.00%      24.704ms     823.472us           0 B           0 B       5.15 GB           0 B            30              [[64, 480000], [], [], [], []]  
                                         aten::_to_copy         0.03%     176.034us         0.13%     866.966us      28.899us       0.000us         0.00%      24.704ms     823.472us           0 B           0 B       5.15 GB           0 B            30      [[64, 480000], [], [], [], [], [], []]  
                                            aten::copy_         0.03%     226.703us         0.07%     490.761us      16.359us      24.704ms         3.70%      24.704ms     823.472us           0 B           0 B           0 B           0 B            30            [[64, 480000], [64, 480000], []]  
     torchfx::prefix_scan_phase2(torchfx::Mat3x3*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      23.048ms         3.45%      23.048ms     853.619us           0 B           0 B           0 B           0 B            27                                          []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.560ms         2.03%      13.560ms     904.007us           0 B           0 B           0 B           0 B            15                                          []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.144ms         1.67%      11.144ms     742.936us           0 B           0 B           0 B           0 B            15                                          []  
                                              aten::mul         0.01%      48.404us         0.01%      73.556us      24.519us       1.639ms         0.25%       1.639ms     546.239us           0 B           0 B     351.56 MB     351.56 MB             3                       [[32, 2, 480000], []]  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.639ms         0.25%       1.639ms     546.239us           0 B           0 B           0 B           0 B             3                                          []  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      99.939us         0.01%      99.939us       1.388us           0 B           0 B           0 B           0 B            72                                          []  
                                              aten::cat         0.05%     323.279us         0.08%     508.230us      18.823us      82.209us         0.01%      82.209us       3.045us           0 B           0 B      27.00 KB      27.00 KB            27                                    [[], []]  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us      82.209us         0.01%      82.209us       3.045us           0 B           0 B           0 B           0 B            27                                          []  
                                             aten::flip         0.04%     294.603us         0.10%     667.289us      24.714us      60.993us         0.01%      60.993us       2.259us           0 B           0 B      27.00 KB           0 B            27                               [[64, 2], []]  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      60.993us         0.01%      60.993us       2.259us           0 B           0 B           0 B           0 B            27                                          []  
                                            aten::copy_         0.04%     244.727us         0.09%     612.874us      12.768us      57.922us         0.01%      57.922us       1.207us           0 B           0 B           0 B           0 B            48                      [[64, 2], [64, 2], []]  
                                            aten::clone         0.01%      92.589us         0.08%     518.394us      21.600us       0.000us         0.00%      42.017us       1.751us           0 B           0 B      48.00 KB           0 B            24                            [[2, 64, 2], []]  
                                            aten::copy_         0.01%      95.796us         0.05%     313.602us      13.067us      42.017us         0.01%      42.017us       1.751us           0 B           0 B           0 B           0 B            24                [[2, 64, 2], [2, 64, 2], []]  
                                               aten::to         0.00%       4.329us        87.34%     584.575ms     194.858ms       0.000us         0.00%       3.136us       1.045us          72 B           0 B           0 B           0 B             3           [[3], [], [], [], [], [], [], []]  
                                         aten::_to_copy         0.00%      12.138us        87.34%     584.570ms     194.857ms       0.000us         0.00%       3.136us       1.045us          72 B           0 B           0 B           0 B             3               [[3], [], [], [], [], [], []]  
                                            aten::copy_         0.00%      17.694us        87.34%     584.549ms     194.850ms       3.136us         0.00%       3.136us       1.045us           0 B           0 B           0 B           0 B             3                              [[3], [3], []]  
                       Memcpy DtoH (Device -> Pageable)         0.00%       0.000us         0.00%       0.000us       0.000us       3.136us         0.00%       3.136us       1.045us           0 B           0 B           0 B           0 B             3                                          []  
                                          aten::reshape         0.00%      30.918us         0.01%      75.468us       5.031us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            15                       [[32, 2, 480000], []]  
                                             aten::view         0.01%      44.550us         0.01%      44.550us       2.970us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            15                       [[32, 2, 480000], []]  
                                    aten::empty_strided         0.07%     449.058us         0.07%     449.058us       5.346us       0.000us         0.00%       0.000us       0.000us          72 B          72 B       5.15 GB       5.15 GB            84                    [[], [], [], [], [], []]  
                                       cudaLaunchKernel         0.17%       1.154ms         0.17%       1.154ms       5.918us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           195                                          []  
                                        cudaMemcpyAsync        87.42%     585.110ms        87.42%     585.110ms       7.801ms       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            75                                          []  
                                           aten::select         0.05%     357.723us         0.06%     424.758us       3.540us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           120                            [[2, 6], [], []]  
                                       aten::as_strided         0.01%      67.035us         0.01%      67.035us       0.559us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           120                        [[2, 6], [], [], []]  
                                           aten::select         0.04%     263.575us         0.05%     315.629us       2.630us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           120                               [[6], [], []]  
                                       aten::as_strided         0.01%      52.054us         0.01%      52.054us       0.434us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           120                           [[6], [], [], []]  
                                             aten::item         0.02%     125.023us         0.02%     162.201us       1.257us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           129                                        [[]]  
                              aten::_local_scalar_dense         0.01%      37.178us         0.01%      37.178us       0.288us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           129                                        [[]]  
                                           aten::select         0.04%     272.745us         0.05%     334.629us       3.486us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            96                        [[2, 64, 2], [], []]  
                                       aten::as_strided         0.01%      61.884us         0.01%      61.884us       0.645us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            96                    [[2, 64, 2], [], [], []]  
                                            aten::empty         0.06%     384.397us         0.06%     384.397us       4.746us       0.000us         0.00%       0.000us       0.000us           0 B           0 B      12.43 GB      12.43 GB            81                    [[], [], [], [], [], []]  
                                           aten::select         0.03%     186.487us         0.03%     224.713us       4.161us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            54                      [[64, 480000], [], []]  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
Self CPU time total: 669.312ms
Self CUDA time total: 668.339ms
