-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                 Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us        1.343s       100.01%        1.343s     447.716ms           0 B           0 B           0 B           0 B             3                                           []  
torchfx::prefix_scan_phase3(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     453.454ms        33.76%     453.454ms      21.593ms           0 B           0 B           0 B           0 B            21                                           []  
     torchfx::prefix_scan_phase2(torchfx::Mat3x3*, int)         0.00%       0.000us         0.00%       0.000us       0.000us     416.830ms        31.04%     416.830ms      19.849ms           0 B           0 B           0 B           0 B            21                                           []  
torchfx::prefix_scan_phase1(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     411.074ms        30.61%     411.074ms      19.575ms           0 B           0 B           0 B           0 B            21                                           []  
torchfx::forcing_kernel(double const*, double const*...         0.00%       0.000us         0.00%       0.000us       0.000us      39.876ms         2.97%      39.876ms       1.899ms           0 B           0 B           0 B           0 B            21                                           []  
                                          ProfilerStep*         0.36%       4.810ms       100.00%        1.345s     448.367ms       0.000us         0.00%      21.797ms       7.266ms           0 B           0 B           0 B     -22.66 GB             3                                           []  
                                               aten::to         0.00%      63.849us         0.07%     884.637us      73.720us       0.000us         0.00%      18.544ms       1.545ms           0 B           0 B       3.87 GB           0 B            12              [[2, 28800000], [], [], [], []]  
                                         aten::_to_copy         0.01%     189.673us         0.06%     820.788us      68.399us       0.000us         0.00%      18.544ms       1.545ms           0 B           0 B       3.87 GB           0 B            12      [[2, 28800000], [], [], [], [], [], []]  
                                            aten::copy_         0.01%     189.380us         0.03%     462.610us      38.551us      18.544ms         1.38%      18.544ms       1.545ms           0 B           0 B           0 B           0 B            12           [[2, 28800000], [2, 28800000], []]  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      10.171ms         0.76%      10.171ms       1.695ms           0 B           0 B           0 B           0 B             6                                           []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.373ms         0.62%       8.373ms       1.395ms           0 B           0 B           0 B           0 B             6                                           []  
                                              aten::mul         0.01%      78.957us         0.01%     119.816us      39.939us       3.073ms         0.23%       3.073ms       1.024ms           0 B           0 B     660.00 MB     660.00 MB             3                          [[2, 28800000], []]  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.073ms         0.23%       3.073ms       1.024ms           0 B           0 B           0 B           0 B             3                                           []  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      70.204us         0.01%      70.204us       1.300us           0 B           0 B           0 B           0 B            54                                           []  
                                              aten::cat         0.03%     357.308us         0.04%     548.429us      26.116us      58.113us         0.00%      58.113us       2.767us           0 B           0 B      10.50 KB      10.50 KB            21                                     [[], []]  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us      58.113us         0.00%      58.113us       2.767us           0 B           0 B           0 B           0 B            21                                           []  
                                            aten::copy_         0.02%     259.841us         0.05%     627.928us      14.951us      49.852us         0.00%      49.852us       1.187us           0 B           0 B           0 B           0 B            42                         [[2, 2], [2, 2], []]  
                                             aten::flip         0.02%     302.144us         0.05%     654.333us      31.159us      44.354us         0.00%      44.354us       2.112us           0 B           0 B      10.50 KB           0 B            21                                 [[2, 2], []]  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      44.354us         0.00%      44.354us       2.112us           0 B           0 B           0 B           0 B            21                                           []  
                                            aten::clone         0.00%      33.626us         0.01%     163.370us      27.228us       0.000us         0.00%      10.176us       1.696us           0 B           0 B       3.00 KB           0 B             6                              [[3, 2, 2], []]  
                                            aten::copy_         0.00%      30.124us         0.01%      96.690us      16.115us      10.176us         0.00%      10.176us       1.696us           0 B           0 B           0 B           0 B             6                   [[3, 2, 2], [3, 2, 2], []]  
                                            aten::clone         0.00%      40.591us         0.02%     260.127us      43.355us       0.000us         0.00%      10.176us       1.696us           0 B           0 B       3.00 KB           0 B             6                              [[4, 2, 2], []]  
                                            aten::copy_         0.00%      39.909us         0.01%     175.581us      29.263us      10.176us         0.00%      10.176us       1.696us           0 B           0 B           0 B           0 B             6                   [[4, 2, 2], [4, 2, 2], []]  
                                       cudaLaunchKernel         0.09%       1.169ms         0.09%       1.169ms       8.287us       7.872us         0.00%       7.872us       0.056us           0 B           0 B           0 B           0 B           141                                           []  
                                    aten::empty_strided         0.03%     362.306us         0.03%     362.306us       8.051us       0.000us         0.00%       0.000us       0.000us           0 B           0 B       3.87 GB       3.87 GB            45                     [[], [], [], [], [], []]  
                                        cudaMemcpyAsync         0.04%     570.325us         0.04%     570.325us      10.562us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            54                                           []  
                                           aten::select         0.02%     241.247us         0.02%     284.855us       4.748us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            60                             [[4, 6], [], []]  
                                       aten::as_strided         0.00%      43.608us         0.00%      43.608us       0.727us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            60                         [[4, 6], [], [], []]  
                                           aten::select         0.03%     356.691us         0.03%     410.048us       3.905us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                [[6], [], []]  
                                       aten::as_strided         0.00%      53.357us         0.00%      53.357us       0.508us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                            [[6], [], [], []]  
                                             aten::item         0.01%     140.060us         0.01%     181.862us       1.732us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                         [[]]  
                              aten::_local_scalar_dense         0.00%      41.802us         0.00%      41.802us       0.398us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                         [[]]  
                                           aten::select         0.01%     178.539us         0.02%     210.533us       4.386us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            48                          [[4, 2, 2], [], []]  
                                       aten::as_strided         0.00%      31.994us         0.00%      31.994us       0.667us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            48                      [[4, 2, 2], [], [], []]  
                                            aten::empty         0.03%     340.696us         0.03%     340.696us       5.408us       0.000us         0.00%       0.000us       0.000us           0 B           0 B      18.15 GB      18.15 GB            63                     [[], [], [], [], [], []]  
                                           aten::select         0.01%     150.654us         0.01%     187.026us       4.453us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            42                      [[2, 28800000], [], []]  
                                       aten::as_strided         0.00%      54.675us         0.00%      54.675us       0.868us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            63                  [[2, 28800000], [], [], []]  
                                        aten::unsqueeze         0.01%     145.861us         0.01%     179.573us       4.276us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            42                                    [[2], []]  
                                       aten::as_strided         0.00%      33.712us         0.00%      33.712us       0.803us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            42                            [[2], [], [], []]  
                                           aten::narrow         0.01%      82.718us         0.02%     215.919us      10.282us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            21                  [[2, 28800000], [], [], []]  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
Self CPU time total: 1.345s
Self CUDA time total: 1.343s
