-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                 Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us        1.347s       100.17%        1.347s     449.045ms           0 B           0 B           0 B           0 B             3                                           []  
torchfx::prefix_scan_phase3(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     453.657ms        33.73%     453.657ms      21.603ms           0 B           0 B           0 B           0 B            21                                           []  
     torchfx::prefix_scan_phase2(torchfx::Mat3x3*, int)         0.00%       0.000us         0.00%       0.000us       0.000us     417.565ms        31.05%     417.565ms      19.884ms           0 B           0 B           0 B           0 B            21                                           []  
torchfx::prefix_scan_phase1(double const*, double*, ...         0.00%       0.000us         0.00%       0.000us       0.000us     411.940ms        30.63%     411.940ms      19.616ms           0 B           0 B           0 B           0 B            21                                           []  
torchfx::forcing_kernel(double const*, double const*...         0.00%       0.000us         0.00%       0.000us       0.000us      39.876ms         2.97%      39.876ms       1.899ms           0 B           0 B           0 B           0 B            21                                           []  
                                          ProfilerStep*         0.40%       5.423ms       100.00%        1.349s     449.581ms       0.000us         0.00%      21.768ms       7.256ms           0 B       -1008 B           0 B     -22.66 GB             3                                           []  
                                               aten::to         0.00%      37.795us         0.04%     587.424us      48.952us       0.000us         0.00%      18.523ms       1.544ms           0 B           0 B       3.87 GB           0 B            12              [[2, 28800000], [], [], [], []]  
                                         aten::_to_copy         0.01%     103.264us         0.04%     549.629us      45.802us       0.000us         0.00%      18.523ms       1.544ms           0 B           0 B       3.87 GB           0 B            12      [[2, 28800000], [], [], [], [], [], []]  
                                            aten::copy_         0.01%     141.431us         0.03%     350.575us      29.215us      18.523ms         1.38%      18.523ms       1.544ms           0 B           0 B           0 B           0 B            12           [[2, 28800000], [2, 28800000], []]  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      10.149ms         0.75%      10.149ms       1.692ms           0 B           0 B           0 B           0 B             6                                           []  
void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.374ms         0.62%       8.374ms       1.396ms           0 B           0 B           0 B           0 B             6                                           []  
                                              aten::mul         0.01%      72.554us         0.01%     106.346us      35.449us       3.062ms         0.23%       3.062ms       1.021ms           0 B           0 B     660.00 MB     660.00 MB             3                          [[2, 28800000], []]  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.062ms         0.23%       3.062ms       1.021ms           0 B           0 B           0 B           0 B             3                                           []  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      69.693us         0.01%      69.693us       1.291us           0 B           0 B           0 B           0 B            54                                           []  
                                              aten::cat         0.03%     401.457us         0.04%     605.493us      28.833us      58.271us         0.00%      58.271us       2.775us           0 B           0 B      10.50 KB      10.50 KB            21                                     [[], []]  
void at::native::(anonymous namespace)::CatArrayBatc...         0.00%       0.000us         0.00%       0.000us       0.000us      58.271us         0.00%      58.271us       2.775us           0 B           0 B           0 B           0 B            21                                           []  
                                            aten::copy_         0.02%     279.877us         0.05%     685.997us      16.333us      50.237us         0.00%      50.237us       1.196us           0 B           0 B           0 B           0 B            42                         [[2, 2], [2, 2], []]  
                                             aten::flip         3.02%      40.715ms         3.06%      41.235ms       1.964ms      44.219us         0.00%      44.219us       2.106us           0 B           0 B      10.50 KB           0 B            21                                 [[2, 2], []]  
void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      44.219us         0.00%      44.219us       2.106us           0 B           0 B           0 B           0 B            21                                           []  
                                            aten::clone         0.00%      38.765us         0.01%     198.410us      33.068us       0.000us         0.00%       9.760us       1.627us           0 B           0 B       3.00 KB           0 B             6                              [[4, 2, 2], []]  
                                            aten::copy_         0.00%      36.191us         0.01%     118.137us      19.689us       9.760us         0.00%       9.760us       1.627us           0 B           0 B           0 B           0 B             6                   [[4, 2, 2], [4, 2, 2], []]  
                                            aten::clone         0.00%      31.628us         0.01%     166.477us      27.746us       0.000us         0.00%       9.696us       1.616us           0 B           0 B       3.00 KB           0 B             6                              [[3, 2, 2], []]  
                                            aten::copy_         0.00%      29.766us         0.01%     100.208us      16.701us       9.696us         0.00%       9.696us       1.616us           0 B           0 B           0 B           0 B             6                   [[3, 2, 2], [3, 2, 2], []]  
                       Memcpy DtoH (Device -> Pageable)         0.00%       0.000us         0.00%       0.000us       0.000us       6.529us         0.00%       6.529us       1.088us           0 B           0 B           0 B           0 B             6                                           []  
                                               aten::to         0.00%      31.808us         0.40%       5.399ms     899.804us       0.000us         0.00%       4.449us       0.742us         576 B           0 B       1.50 KB           0 B             6                 [[4, 6], [], [], [], [], []]  
                                         aten::_to_copy         0.01%     136.762us         0.40%       5.367ms     894.502us       0.000us         0.00%       4.449us       0.742us         576 B           0 B       1.50 KB           0 B             6             [[4, 6], [], [], [], [], [], []]  
                                            aten::copy_         0.01%      84.094us         0.38%       5.118ms     852.937us       4.449us         0.00%       4.449us       0.742us           0 B           0 B           0 B           0 B             6                         [[4, 6], [4, 6], []]  
                                               aten::to         0.00%      14.530us        53.40%     720.184ms     120.031ms       0.000us         0.00%       4.415us       0.736us         432 B           0 B       1.50 KB           0 B             6                 [[3, 6], [], [], [], [], []]  
                                         aten::_to_copy         0.00%      42.957us        53.40%     720.170ms     120.028ms       0.000us         0.00%       4.415us       0.736us         432 B           0 B       1.50 KB           0 B             6             [[3, 6], [], [], [], [], [], []]  
                                            aten::copy_         0.00%      51.657us        53.39%     720.086ms     120.014ms       4.415us         0.00%       4.415us       0.736us           0 B           0 B           0 B           0 B             6                         [[3, 6], [3, 6], []]  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       2.335us         0.00%       2.335us       0.389us           0 B           0 B           0 B           0 B             6                                           []  
                                       cudaLaunchKernel         0.09%       1.250ms         0.09%       1.250ms       8.864us       1.632us         0.00%       1.632us       0.012us           0 B           0 B           0 B           0 B           141                                           []  
                                    aten::empty_strided         0.04%     490.917us         0.04%     490.917us       8.613us       0.000us         0.00%       0.000us       0.000us        1008 B        1008 B       3.87 GB       3.87 GB            57                     [[], [], [], [], [], []]  
                                        cudaMemcpyAsync         0.78%      10.560ms         0.78%      10.560ms     159.993us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            66                                           []  
                                  cudaStreamSynchronize        53.02%     715.066ms        53.02%     715.066ms      59.589ms       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            12                                           []  
                                           aten::detach         0.00%       6.884us         0.00%      26.305us       8.768us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B             3                                     [[4, 6]]  
                                                 detach         0.00%      19.421us         0.00%      19.421us       6.474us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B             3                                     [[4, 6]]  
                                           aten::select         0.02%     249.029us         0.02%     312.531us       5.209us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            60                             [[4, 6], [], []]  
                                       aten::as_strided         0.00%      63.502us         0.00%      63.502us       1.058us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B            60                         [[4, 6], [], [], []]  
                                           aten::select         0.02%     290.093us         0.03%     376.459us       3.585us       0.000us         0.00%       0.000us       0.000us           0 B           0 B           0 B           0 B           105                                [[6], [], []]  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------  
Self CPU time total: 1.349s
Self CUDA time total: 1.345s
