LICENSE
NOTICE
README.md
pyproject.toml
setup.cfg
mlstm_kernels/__init__.py
mlstm_kernels.egg-info/PKG-INFO
mlstm_kernels.egg-info/SOURCES.txt
mlstm_kernels.egg-info/dependency_links.txt
mlstm_kernels.egg-info/requires.txt
mlstm_kernels.egg-info/top_level.txt
mlstm_kernels/baselines/__init__.py
mlstm_kernels/baselines/flash_attention/__init__.py
mlstm_kernels/baselines/flash_attention/flash_attention_triton.py
mlstm_kernels/baselines/flash_attention/torch_sdp_attention.py
mlstm_kernels/baselines/flash_attention/triton_tutorial.py
mlstm_kernels/baselines/flash_linear_attention/__init__.py
mlstm_kernels/baselines/flash_linear_attention/fla_utils.py
mlstm_kernels/baselines/flash_linear_attention/gla/__init__.py
mlstm_kernels/baselines/flash_linear_attention/gla/chunk.py
mlstm_kernels/baselines/flash_linear_attention/gla/chunk_fuse.py
mlstm_kernels/baselines/flash_linear_attention/gla/chunk_util.py
mlstm_kernels/baselines/flash_linear_attention/gla/naive.py
mlstm_kernels/baselines/flash_linear_attention/gla/recurrent_fuse.py
mlstm_kernels/baselines/flash_linear_attention/simple_gla/__init__.py
mlstm_kernels/baselines/flash_linear_attention/simple_gla/chunk.py
mlstm_kernels/baselines/flash_linear_attention/simple_gla/naive.py
mlstm_kernels/baselines/lightning_attention/__init__.py
mlstm_kernels/baselines/lightning_attention/lightning_attn2.py
mlstm_kernels/baselines/lightning_attention/utils.py
mlstm_kernels/jax/__init__.py
mlstm_kernels/jax/stride_utils.py
mlstm_kernels/jax/utils.py
mlstm_kernels/jax/xla_utils.py
mlstm_kernels/jax/chunkwise/__init__.py
mlstm_kernels/jax/chunkwise/native/__init__.py
mlstm_kernels/jax/chunkwise/native/fw.py
mlstm_kernels/jax/chunkwise/native/fwbw.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/__init__.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/bw.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/bw_parallel.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/bw_recurrent.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/fw.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/fw_parallel.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/fw_recurrent.py
mlstm_kernels/jax/chunkwise/triton_limit_chunk/fwbw.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/__init__.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/bw.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/bw_parallel_dK.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/bw_parallel_dQ.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/bw_parallel_dV.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/bw_recurrent.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/chunkwise_gates.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/fw.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/fw_parallel.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/fw_recurrent.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk/fwbw.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/__init__.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/bw.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/bw_parallel_dK.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/bw_parallel_dQ.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/bw_parallel_dV.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/bw_recurrent.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/chunkwise_gates.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/fw.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/fw_parallel.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/fw_recurrent.py
mlstm_kernels/jax/chunkwise/triton_xl_chunk_siging/fwbw.py
mlstm_kernels/jax/parallel/__init__.py
mlstm_kernels/jax/parallel/native/__init__.py
mlstm_kernels/jax/parallel/native/bw.py
mlstm_kernels/jax/parallel/native/fw.py
mlstm_kernels/jax/parallel/native/fwbw.py
mlstm_kernels/jax/parallel/native_siging/__init__.py
mlstm_kernels/jax/parallel/native_siging/bw.py
mlstm_kernels/jax/parallel/native_siging/fw.py
mlstm_kernels/jax/parallel/native_siging/fwbw.py
mlstm_kernels/jax/parallel/native_stablef/__init__.py
mlstm_kernels/jax/parallel/native_stablef/bw.py
mlstm_kernels/jax/parallel/native_stablef/fw.py
mlstm_kernels/jax/parallel/native_stablef/fwbw.py
mlstm_kernels/jax/recurrent/__init__.py
mlstm_kernels/jax/recurrent/native_sequence.py
mlstm_kernels/jax/recurrent/native_sequence_scan.py
mlstm_kernels/jax/recurrent/native_step.py
mlstm_kernels/jax/recurrent/triton_step.py
mlstm_kernels/torch/__init__.py
mlstm_kernels/torch/backend_module.py
mlstm_kernels/torch/kernel_wrappers.py
mlstm_kernels/torch/utils.py
mlstm_kernels/torch/chunkwise/__init__.py
mlstm_kernels/torch/chunkwise/native/__init__.py
mlstm_kernels/torch/chunkwise/native/bw.py
mlstm_kernels/torch/chunkwise/native/fw.py
mlstm_kernels/torch/chunkwise/native/fwbw.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/__init__.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/bw.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/bw_parallel.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/bw_recurrent.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/chunkwise_gates.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/fw.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/fw_parallel.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/fw_recurrent.py
mlstm_kernels/torch/chunkwise/triton_limit_chunk/fwbw.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/__init__.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/bw.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/bw_parallel_dK.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/bw_parallel_dQ.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/bw_parallel_dV.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/bw_recurrent.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/chunkwise_gates.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/fw.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/fw_parallel.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/fw_recurrent.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk/fwbw.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/__init__.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/bw.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/bw_parallel_dK.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/bw_parallel_dQ.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/bw_parallel_dV.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/bw_recurrent.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/chunkwise_gates.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/fw.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/fw_parallel.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/fw_recurrent.py
mlstm_kernels/torch/chunkwise/triton_xl_chunk_siging/fwbw.py
mlstm_kernels/torch/parallel/__init__.py
mlstm_kernels/torch/parallel/_native_tiled.py
mlstm_kernels/torch/parallel/_legacy_native_siging/__init__.py
mlstm_kernels/torch/parallel/_legacy_native_siging/ops.py
mlstm_kernels/torch/parallel/_legacy_native_siging/sig_ingate.py
mlstm_kernels/torch/parallel/native/__init__.py
mlstm_kernels/torch/parallel/native/bw.py
mlstm_kernels/torch/parallel/native/fw.py
mlstm_kernels/torch/parallel/native/fwbw.py
mlstm_kernels/torch/parallel/native_siging/__init__.py
mlstm_kernels/torch/parallel/native_siging/bw.py
mlstm_kernels/torch/parallel/native_siging/fw.py
mlstm_kernels/torch/parallel/native_siging/fwbw.py
mlstm_kernels/torch/parallel/native_stablef/__init__.py
mlstm_kernels/torch/parallel/native_stablef/bw.py
mlstm_kernels/torch/parallel/native_stablef/fw.py
mlstm_kernels/torch/parallel/native_stablef/fwbw.py
mlstm_kernels/torch/parallel/triton_limit_headdim/__init__.py
mlstm_kernels/torch/parallel/triton_limit_headdim/bw.py
mlstm_kernels/torch/parallel/triton_limit_headdim/fw.py
mlstm_kernels/torch/parallel/triton_limit_headdim/fwbw.py
mlstm_kernels/torch/recurrent/__init__.py
mlstm_kernels/torch/recurrent/native_sequence.py
mlstm_kernels/torch/recurrent/native_step.py
mlstm_kernels/torch/recurrent/triton_step.py
mlstm_kernels/torch/recurrent/triton_step_alternate.py
mlstm_kernels/triton/__init__.py
mlstm_kernels/triton/kernel_param_heuristics.py
mlstm_kernels/triton/chunkwise/__init__.py
mlstm_kernels/triton/chunkwise/kernel_param_heuristics.py
mlstm_kernels/triton/chunkwise/limit_chunk/__init__.py
mlstm_kernels/triton/chunkwise/limit_chunk/bw_kernel_parallel.py
mlstm_kernels/triton/chunkwise/limit_chunk/bw_kernel_recurrent.py
mlstm_kernels/triton/chunkwise/limit_chunk/fw_kernel_parallel.py
mlstm_kernels/triton/chunkwise/limit_chunk/fw_kernel_recurrent.py
mlstm_kernels/triton/chunkwise/xl_chunk/__init__.py
mlstm_kernels/triton/chunkwise/xl_chunk/bw_kernel_parallel_dK.py
mlstm_kernels/triton/chunkwise/xl_chunk/bw_kernel_parallel_dQ.py
mlstm_kernels/triton/chunkwise/xl_chunk/bw_kernel_parallel_dV.py
mlstm_kernels/triton/chunkwise/xl_chunk/bw_kernel_recurrent.py
mlstm_kernels/triton/chunkwise/xl_chunk/fw_kernel_parallel.py
mlstm_kernels/triton/chunkwise/xl_chunk/fw_kernel_recurrent.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/__init__.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/bw_kernel_parallel_dK.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/bw_kernel_parallel_dQ.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/bw_kernel_parallel_dV.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/bw_kernel_recurrent.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/fw_kernel_parallel.py
mlstm_kernels/triton/chunkwise/xl_chunk_siging/fw_kernel_recurrent.py
mlstm_kernels/triton/parallel/__init__.py
mlstm_kernels/triton/parallel/limit_headdim/__init__.py
mlstm_kernels/triton/parallel/limit_headdim/bw_kernel.py
mlstm_kernels/triton/parallel/limit_headdim/fw_kernel.py
mlstm_kernels/triton/recurrent/__init__.py
mlstm_kernels/triton/recurrent/fw_step_alternate.py
mlstm_kernels/triton/recurrent/fw_step_fused.py
mlstm_kernels/utils/__init__.py
mlstm_kernels/utils/kernels.py
mlstm_kernels/utils/time.py
mlstm_kernels/utils/analysis/__init__.py
mlstm_kernels/utils/analysis/roofline_analysis/__init__.py
mlstm_kernels/utils/analysis/roofline_analysis/flops_mlstm.py
mlstm_kernels/utils/analysis/roofline_analysis/memops_mlstm.py
mlstm_kernels/utils/analysis/roofline_analysis/plot_config.py
mlstm_kernels/utils/analysis/roofline_analysis/plot_mlstm_arithmetic_intensity.py
mlstm_kernels/utils/analysis/roofline_analysis/plot_mlstm_flop_analysis.py
mlstm_kernels/utils/analysis/roofline_analysis/plot_mlstm_optimal_chunksize.py
mlstm_kernels/utils/analysis/roofline_analysis/plot_roofline_model.py
mlstm_kernels/utils/analysis/roofline_analysis/plot_runtime.py
mlstm_kernels/utils/analysis/roofline_analysis/roofline_analysis_mlstm.py
mlstm_kernels/utils/analysis/transfer_behavior/__init__.py
mlstm_kernels/utils/analysis/transfer_behavior/_mlstm_cells.py
mlstm_kernels/utils/analysis/transfer_behavior/_norm_layers.py
mlstm_kernels/utils/analysis/transfer_behavior/generate_transfer_behavior_data.py
mlstm_kernels/utils/analysis/transfer_behavior/mlstm_cell_func.py
mlstm_kernels/utils/analysis/transfer_behavior/plot_transfer_behavior.py
mlstm_kernels/utils/benchmark/__init__.py
mlstm_kernels/utils/benchmark/cuda_graphs.py
mlstm_kernels/utils/benchmark/param_handling.py
mlstm_kernels/utils/benchmark/plot_config.py
mlstm_kernels/utils/benchmark/plot_results.py
mlstm_kernels/utils/benchmark/run_benchmark.py
mlstm_kernels/utils/benchmark/runtime.py
mlstm_kernels/utils/benchmark/utils.py
mlstm_kernels/utils/flops/__init__.py
mlstm_kernels/utils/flops/mlstm_block_flop_counts.py
mlstm_kernels/utils/flops/mlstm_flop_analysis.py
mlstm_kernels/utils/flops/model_flops_computation.py
mlstm_kernels/utils/flops/slstm_block_flop_counts.py
mlstm_kernels/utils/flops/transformer_block_flop_counts.py
mlstm_kernels/utils/plot/__init__.py
mlstm_kernels/utils/plot/bar_plot.py
mlstm_kernels/utils/plot/diff_imshow.py
mlstm_kernels/utils/plot/diff_lineplot.py
mlstm_kernels/utils/plot/ewma.py
mlstm_kernels/utils/test/__init__.py
mlstm_kernels/utils/test/checks.py
mlstm_kernels/utils/test/fixtures.py
mlstm_kernels/utils/test/test_fwbw.py
mlstm_kernels/utils/test/test_templates/__init__.py
tests/test_padding.py