-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us     669.549ms       100.03%     669.549ms     223.183ms           0 B           0 B           0 B           0 B             3                                          []  
torchfx::prefix_scan_phase3(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     310.785ms        46.43%     310.785ms      11.511ms           0 B           0 B           0 B           0 B            27                                          []  
torchfx::prefix_scan_phase1(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     281.707ms        42.09%     281.707ms      10.434ms           0 B           0 B           0 B           0 B            27                                          []  
torchfx::forcing_kernel(double const*, double const*...         0.00%       0.000us         0.00%       0.000us       0.000us      27.354ms         4.09%      27.354ms       1.013ms           0 B           0 B           0 B           0 B            27                                          []  
                                          ProfilerStep*         0.92%       6.149ms       100.00%     671.042ms     223.681ms       0.000us         0.00%      26.580ms       8.860ms           0 B           0 B           0 B     -17.93 GB             3                                          []  
                                               aten::to         0.01%      82.529us         0.20%       1.320ms      44.003us       0.000us         0.00%      24.691ms     823.018us           0 B           0 B       5.15 GB           0 B            30              [[64, 480000], [], [], [], []]  
                                         aten::_to_copy         0.04%     235.427us         0.18%       1.238ms      41.252us       0.000us         0.00%      24.691ms     823.018us           0 B           0 B       5.15 GB           0 B            30      [[64, 480000], [], [], [], [], [], []]  
                                            aten::copy_         0.05%     319.795us         0.11%     716.223us      23.874us      24.691ms         3.69%      24.691ms     823.018us           0 B           0 B           0 B           0 B            30            [[64, 480000], [64, 480000], []]  
     torchfx::prefix_scan_phase2(torchfx::Mat3x3*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      22.941ms         3.43%      22.941ms     849.675us           0 B           0 B           0 B           0 B            27                                          []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.552ms         2.02%      13.552ms     903.463us           0 B           0 B           0 B           0 B            15                                          []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.139ms         1.66%      11.139ms     742.573us           0 B           0 B           0 B           0 B            15                                          []  
                                              aten::mul         0.01%      81.237us         0.02%     118.026us      39.342us       1.630ms         0.24%       1.630ms     543.199us           0 B           0 B     351.56 MB     351.56 MB             3                       [[32, 2, 480000], []]  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.630ms         0.24%       1.630ms     543.199us           0 B           0 B           0 B           0 B             3                                          []  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     118.372us         0.02%     118.372us       1.409us           0 B           0 B           0 B           0 B            84                                          []  
                                              aten::cat         0.06%     432.838us         0.10%     664.061us      24.595us      80.895us         0.01%      80.895us       2.996us           0 B           0 B      27.00 KB      27.00 KB            27                                    [[], []]  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us      80.895us         0.01%      80.895us       2.996us           0 B           0 B           0 B           0 B            27                                          []  
                                            aten::copy_         0.05%     353.949us         0.13%     853.189us      15.800us      64.930us         0.01%      64.930us       1.202us           0 B           0 B           0 B           0 B            54                      [[64, 2], [64, 2], []]  
                                             aten::flip         0.06%     375.487us         0.12%     832.686us      30.840us      60.804us         0.01%      60.804us       2.252us           0 B           0 B      27.00 KB           0 B            27                               [[64, 2], []]  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      60.804us         0.01%      60.804us       2.252us           0 B           0 B           0 B           0 B            27                                          []  
                                            aten::clone         0.02%     154.599us         0.11%     749.500us      31.229us       0.000us         0.00%      42.882us       1.787us           0 B           0 B      48.00 KB           0 B            24                            [[2, 64, 2], []]  
                                            aten::copy_         0.02%     148.249us         0.07%     454.344us      18.931us      42.882us         0.01%      42.882us       1.787us           0 B           0 B           0 B           0 B            24                [[2, 64, 2], [2, 64, 2], []]  
                                            aten::clone         0.00%      30.939us         0.02%     152.650us      25.442us       0.000us         0.00%      10.560us       1.760us           0 B           0 B       6.00 KB           0 B             6                            [[1, 64, 2], []]  
                                            aten::copy_         0.00%      28.175us         0.01%      89.383us      14.897us      10.560us         0.00%      10.560us       1.760us           0 B           0 B           0 B           0 B             6                [[1, 64, 2], [1, 64, 2], []]  
                                          aten::reshape         0.01%      51.542us         0.02%     148.182us       9.879us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            15                       [[32, 2, 480000], []]  
                                             aten::view         0.01%      96.640us         0.01%      96.640us       6.443us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            15                       [[32, 2, 480000], []]  
                                    aten::empty_strided         0.09%     621.878us         0.09%     621.878us       7.148us       0.000us         0.00%       0.000us       0.000us           0 B           0 B       5.15 GB       5.15 GB            87                    [[], [], [], [], [], []]  
                                       cudaLaunchKernel         0.22%       1.484ms         0.22%       1.484ms       7.609us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           195                                          []  
                                        cudaMemcpyAsync         0.13%     866.543us         0.13%     866.543us      10.316us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            84                                          []  
                                           aten::select         0.06%     430.477us         0.08%     510.275us       4.252us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           120                            [[2, 6], [], []]  
                                       aten::as_strided         0.01%      79.798us         0.01%      79.798us       0.665us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           120                        [[2, 6], [], [], []]  
                                           aten::select         0.06%     386.585us         0.07%     462.285us       3.424us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           135                               [[6], [], []]  
                                       aten::as_strided         0.01%      75.700us         0.01%      75.700us       0.561us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           135                           [[6], [], [], []]  
                                             aten::item         0.04%     241.929us         0.04%     294.054us       2.178us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           135                                        [[]]  
                              aten::_local_scalar_dense         0.01%      52.125us         0.01%      52.125us       0.386us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           135                                        [[]]  
                                           aten::select         0.06%     374.827us         0.07%     443.393us       4.619us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            96                        [[2, 64, 2], [], []]  
                                       aten::as_strided         0.01%      68.566us         0.01%      68.566us       0.714us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            96                    [[2, 64, 2], [], [], []]  
                                            aten::empty         0.07%     463.707us         0.07%     463.707us       5.725us       0.000us         0.00%       0.000us       0.000us           0 B           0 B      12.43 GB      12.43 GB            81                    [[], [], [], [], [], []]  
                                           aten::select         0.03%     200.573us         0.04%     244.566us       4.529us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            54                      [[64, 480000], [], []]  
                                       aten::as_strided         0.01%      72.118us         0.01%      72.118us       0.890us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            81                  [[64, 480000], [], [], []]  
                                        aten::unsqueeze         0.03%     184.502us         0.03%     225.207us       4.170us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            54                                  [[64], []]  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------------  
Self CPU time total: 671.049ms
Self CUDA time total: 669.367ms
