-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                 Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us        1.343s       100.15%        1.343s     447.598ms           0 B           0 B           0 B           0 B             3                                           []  
torchfx::prefix_scan_phase3(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     452.502ms        33.75%     452.502ms      21.548ms           0 B           0 B           0 B           0 B            21                                           []  
     torchfx::prefix_scan_phase2(torchfx::Mat3x3*, int)         0.00%       0.000us         0.00%       0.000us       0.000us     416.395ms        31.05%     416.395ms      19.828ms           0 B           0 B           0 B           0 B            21                                           []  
torchfx::prefix_scan_phase1(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     410.257ms        30.60%     410.257ms      19.536ms           0 B           0 B           0 B           0 B            21                                           []  
torchfx::forcing_kernel(double const*, double const*...         0.00%       0.000us         0.00%       0.000us       0.000us      39.882ms         2.97%      39.882ms       1.899ms           0 B           0 B           0 B           0 B            21                                           []  
                                          ProfilerStep*         0.38%       5.062ms       100.00%        1.344s     447.887ms       0.000us         0.00%      21.805ms       7.268ms           0 B           0 B           0 B     -22.66 GB             3                                           []  
                                               aten::to         0.00%      50.822us         0.06%     747.090us      62.258us       0.000us         0.00%      18.563ms       1.547ms           0 B           0 B       3.87 GB           0 B            12              [[2, 28800000], [], [], [], []]  
                                         aten::_to_copy         0.01%     173.908us         0.05%     696.268us      58.022us       0.000us         0.00%      18.563ms       1.547ms           0 B           0 B       3.87 GB           0 B            12      [[2, 28800000], [], [], [], [], [], []]  
                                            aten::copy_         0.01%     158.653us         0.03%     374.466us      31.206us      18.563ms         1.38%      18.563ms       1.547ms           0 B           0 B           0 B           0 B            12           [[2, 28800000], [2, 28800000], []]  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      10.186ms         0.76%      10.186ms       1.698ms           0 B           0 B           0 B           0 B             6                                           []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.377ms         0.62%       8.377ms       1.396ms           0 B           0 B           0 B           0 B             6                                           []  
                                              aten::mul         0.01%      82.496us         0.01%     125.984us      41.995us       3.062ms         0.23%       3.062ms       1.021ms           0 B           0 B     660.00 MB     660.00 MB             3                          [[2, 28800000], []]  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.062ms         0.23%       3.062ms       1.021ms           0 B           0 B           0 B           0 B             3                                           []  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      70.595us         0.01%      70.595us       1.307us           0 B           0 B           0 B           0 B            54                                           []  
                                              aten::cat         0.03%     436.450us         0.05%     693.361us      33.017us      58.144us         0.00%      58.144us       2.769us           0 B           0 B      10.50 KB      10.50 KB            21                                     [[], []]  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us      58.144us         0.00%      58.144us       2.769us           0 B           0 B           0 B           0 B            21                                           []  
                                            aten::copy_         0.03%     428.546us         0.08%       1.019ms      24.263us      50.147us         0.00%      50.147us       1.194us           0 B           0 B           0 B           0 B            42                         [[2, 2], [2, 2], []]  
                                             aten::flip         1.83%      24.640ms         1.88%      25.248ms       1.202ms      44.286us         0.00%      44.286us       2.109us           0 B           0 B      10.50 KB           0 B            21                                 [[2, 2], []]  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      44.286us         0.00%      44.286us       2.109us           0 B           0 B           0 B           0 B            21                                           []  
                                            aten::clone         0.00%      48.306us         0.02%     248.729us      41.455us       0.000us         0.00%      10.304us       1.717us           0 B           0 B       3.00 KB           0 B             6                              [[3, 2, 2], []]  
                                            aten::copy_         0.00%      45.798us         0.01%     143.586us      23.931us      10.304us         0.00%      10.304us       1.717us           0 B           0 B           0 B           0 B             6                   [[3, 2, 2], [3, 2, 2], []]  
                                            aten::clone         0.00%      25.375us         0.01%     158.131us      26.355us       0.000us         0.00%      10.144us       1.691us           0 B           0 B       3.00 KB           0 B             6                              [[4, 2, 2], []]  
                                            aten::copy_         0.00%      26.373us         0.01%     101.526us      16.921us      10.144us         0.00%      10.144us       1.691us           0 B           0 B           0 B           0 B             6                   [[4, 2, 2], [4, 2, 2], []]  
                                       cudaLaunchKernel         0.12%       1.582ms         0.12%       1.582ms      11.219us       7.904us         0.00%       7.904us       0.056us           0 B           0 B           0 B           0 B           141                                           []  
                                    aten::empty_strided         0.03%     453.217us         0.03%     453.217us      10.071us       0.000us         0.00%       0.000us       0.000us           0 B           0 B       3.87 GB       3.87 GB            45                     [[], [], [], [], [], []]  
                                        cudaMemcpyAsync         0.06%     763.443us         0.06%     763.443us      14.138us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            54                                           []  
                                           aten::select         0.03%     343.427us         0.03%     404.324us       6.739us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            60                             [[4, 6], [], []]  
                                       aten::as_strided         0.00%      60.897us         0.00%      60.897us       1.015us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            60                         [[4, 6], [], [], []]  
                                           aten::select         0.04%     508.129us         0.04%     592.646us       5.644us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                [[6], [], []]  
                                       aten::as_strided         0.01%      84.517us         0.01%      84.517us       0.805us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                            [[6], [], [], []]  
                                             aten::item         0.24%       3.171ms         0.24%       3.228ms      30.743us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                         [[]]  
                              aten::_local_scalar_dense         0.00%      57.349us         0.00%      57.349us       0.546us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                         [[]]  
                                           aten::select         0.02%     269.795us         0.02%     324.826us       6.767us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            48                          [[4, 2, 2], [], []]  
                                       aten::as_strided         0.00%      55.031us         0.00%      55.031us       1.146us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            48                      [[4, 2, 2], [], [], []]  
                                            aten::empty         0.04%     581.346us         0.04%     581.346us       9.228us       0.000us         0.00%       0.000us       0.000us           0 B           0 B      18.15 GB      18.15 GB            63                     [[], [], [], [], [], []]  
                                           aten::select         0.02%     236.515us         0.02%     286.446us       6.820us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            42                      [[2, 28800000], [], []]  
                                       aten::as_strided         0.01%      79.114us         0.01%      79.114us       1.256us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            63                  [[2, 28800000], [], [], []]  
                                        aten::unsqueeze         0.02%     201.988us         0.02%     246.197us       5.862us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            42                                    [[2], []]  
                                       aten::as_strided         0.00%      44.209us         0.00%      44.209us       1.053us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            42                            [[2], [], [], []]  
                                           aten::narrow         0.01%     110.527us         0.02%     300.343us      14.302us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            21                  [[2, 28800000], [], [], []]  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
Self CPU time total: 1.344s
Self CUDA time total: 1.341s
