-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us     673.889ms       100.79%     673.889ms     224.630ms           0 B           0 B           0 B           0 B             3                                          []  
torchfx::prefix_scan_phase3(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     310.350ms        46.42%     310.350ms      11.494ms           0 B           0 B           0 B           0 B            27                                          []  
torchfx::prefix_scan_phase1(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     281.392ms        42.08%     281.392ms      10.422ms           0 B           0 B           0 B           0 B            27                                          []  
torchfx::forcing_kernel(double const*, double const*...         0.00%       0.000us         0.00%       0.000us       0.000us      27.358ms         4.09%      27.358ms       1.013ms           0 B           0 B           0 B           0 B            27                                          []  
                                          ProfilerStep*         1.25%       8.472ms       100.00%     675.132ms     225.044ms       0.000us         0.00%      26.608ms       8.869ms           0 B      -1.20 KB           0 B     -17.93 GB             3                                          []  
                                               aten::to         0.01%      94.075us         0.22%       1.497ms      49.914us       0.000us         0.00%      24.693ms     823.103us           0 B           0 B       5.15 GB           0 B            30              [[64, 480000], [], [], [], []]  
                                         aten::_to_copy         0.04%     281.531us         0.21%       1.403ms      46.778us       0.000us         0.00%      24.693ms     823.103us           0 B           0 B       5.15 GB           0 B            30      [[64, 480000], [], [], [], [], [], []]  
                                            aten::copy_         0.06%     383.112us         0.13%     868.026us      28.934us      24.693ms         3.69%      24.693ms     823.103us           0 B           0 B           0 B           0 B            30            [[64, 480000], [64, 480000], []]  
     torchfx::prefix_scan_phase2(torchfx::Mat3x3*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      22.925ms         3.43%      22.925ms     849.089us           0 B           0 B           0 B           0 B            27                                          []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.562ms         2.03%      13.562ms     645.829us           0 B           0 B           0 B           0 B            21                                          []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.145ms         1.67%      11.145ms     742.979us           0 B           0 B           0 B           0 B            15                                          []  
                                              aten::mul         0.01%      82.634us         0.02%     122.782us      40.927us       1.634ms         0.24%       1.634ms     544.637us           0 B           0 B     351.56 MB     351.56 MB             3                       [[32, 2, 480000], []]  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.634ms         0.24%       1.634ms     544.637us           0 B           0 B           0 B           0 B             3                                          []  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      98.623us         0.01%      98.623us       1.370us           0 B           0 B           0 B           0 B            72                                          []  
                                              aten::cat         0.09%     608.983us         0.14%     914.429us      33.868us      81.283us         0.01%      81.283us       3.010us           0 B           0 B      27.00 KB      27.00 KB            27                                    [[], []]  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us      81.283us         0.01%      81.283us       3.010us           0 B           0 B           0 B           0 B            27                                          []  
                                             aten::flip         0.07%     477.716us         0.15%       1.042ms      38.607us      60.891us         0.01%      60.891us       2.255us           0 B           0 B      27.00 KB           0 B            27                               [[64, 2], []]  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      60.891us         0.01%      60.891us       2.255us           0 B           0 B           0 B           0 B            27                                          []  
                                            aten::copy_         0.05%     309.355us         0.11%     747.577us      15.575us      58.046us         0.01%      58.046us       1.209us           0 B           0 B           0 B           0 B            48                      [[64, 2], [64, 2], []]  
                                            aten::clone         0.02%     151.925us         0.11%     745.456us      31.061us       0.000us         0.00%      40.577us       1.691us           0 B           0 B      48.00 KB           0 B            24                            [[2, 64, 2], []]  
                                            aten::copy_         0.02%     116.377us         0.06%     426.179us      17.757us      40.577us         0.01%      40.577us       1.691us           0 B           0 B           0 B           0 B            24                [[2, 64, 2], [2, 64, 2], []]  
                                               aten::to         0.01%      62.290us        64.95%     438.489ms      18.270ms       0.000us         0.00%      17.793us       0.741us       1.12 KB           0 B       6.00 KB           0 B            24                [[2, 6], [], [], [], [], []]  
                                         aten::_to_copy         0.03%     200.437us        64.94%     438.427ms      18.268ms       0.000us         0.00%      17.793us       0.741us       1.12 KB           0 B       6.00 KB           0 B            24            [[2, 6], [], [], [], [], [], []]  
                                            aten::copy_         0.03%     232.788us        64.88%     437.996ms      18.250ms      17.793us         0.00%      17.793us       0.741us           0 B           0 B           0 B           0 B            24                        [[2, 6], [2, 6], []]  
                                         aten::_to_copy         0.01%      52.674us         0.40%       2.715ms     301.643us       0.000us         0.00%      16.736us       1.860us          72 B           0 B       3.00 KB           0 B             9               [[3], [], [], [], [], [], []]  
                                            aten::copy_         0.01%      68.869us         0.39%       2.604ms     289.283us      16.736us         0.00%      16.736us       1.860us           0 B           0 B           0 B           0 B             9                              [[3], [3], []]  
                       Memcpy DtoH (Device -> Pageable)         0.00%       0.000us         0.00%       0.000us       0.000us      15.873us         0.00%      15.873us       1.058us           0 B           0 B           0 B           0 B            15                                          []  
                                               aten::to         0.00%      10.992us         0.03%     193.553us      32.259us       0.000us         0.00%      13.984us       2.331us           0 B           0 B       3.00 KB           0 B             6                       [[3], [], [], [], []]  
                                             aten::item         0.03%     203.780us        21.13%     142.678ms       1.057ms       0.000us         0.00%       5.952us       0.044us           0 B           0 B           0 B           0 B           135                                        [[]]  
                              aten::_local_scalar_dense         0.02%     140.363us        21.10%     142.474ms       1.055ms       5.952us         0.00%       5.952us       0.044us           0 B           0 B           0 B           0 B           135                                        [[]]  
                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us         0.00%       5.952us       0.992us           0 B           0 B           0 B           0 B             6                                          []  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       4.672us         0.00%       4.672us       0.389us           0 B           0 B           0 B           0 B            12                                          []  
                                               aten::to         0.00%       6.810us         0.38%       2.539ms     846.346us       0.000us         0.00%       2.752us       0.917us          72 B           0 B           0 B           0 B             3           [[3], [], [], [], [], [], [], []]  
                                          aten::reshape         0.01%      45.378us         0.02%     119.571us       7.971us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            15                       [[32, 2, 480000], []]  
                                             aten::view         0.01%      74.193us         0.01%      74.193us       4.946us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            15                       [[32, 2, 480000], []]  
                                    aten::empty_strided         0.13%     867.325us         0.13%     867.325us       7.608us       0.000us         0.00%       0.000us       0.000us       1.20 KB       1.20 KB       5.15 GB       5.15 GB           114                    [[], [], [], [], [], []]  
                                        cudaMemcpyAsync         2.05%      13.824ms         2.05%      13.824ms     131.660us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                          []  
                                  cudaStreamSynchronize        84.35%     569.495ms        84.35%     569.495ms      17.257ms       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            33                                          []  
                                       cudaLaunchKernel         0.29%       1.970ms         0.29%       1.970ms       9.803us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           201                                          []  
                                           aten::detach         0.01%      43.906us         0.02%     116.051us       9.671us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            12                                    [[2, 6]]  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
Self CPU time total: 675.136ms
Self CUDA time total: 668.634ms
