MANIFEST.in
README.md
config_vsa.py
setup.py
vsa.cpp
tk/.git
tk/.gitignore
tk/Doxyfile
tk/LICENSE
tk/README.md
tk/config.py
tk/env.src
tk/setup.py
tk/thunderkittens.cpp
tk/assets/attn.png
tk/assets/thunderkittens.png
tk/demos/README.md
tk/demos/requirements.txt
tk/demos/assets/kittens.png
tk/demos/based_demo/README.md
tk/demos/based_demo/document_ie_based.py
tk/demos/based_demo/generate_based.py
tk/demos/based_demo/based/__init__.py
tk/demos/based_demo/based/models/__init__.py
tk/demos/based_demo/based/models/layer_norm.py
tk/demos/based_demo/based/models/mixers/__init__.py
tk/demos/based_demo/based/models/mixers/convolution.py
tk/demos/based_demo/based/models/mixers/linear_attention.py
tk/demos/based_demo/based/models/mixers/rotary.py
tk/demos/based_demo/based/models/mixers/slide_attention.py
tk/demos/based_demo/plots/benchmark-lin-attn-fwd-B16.png
tk/demos/based_demo/plots/benchmark-lin-attn-fwd-B4.png
tk/demos/based_demo/plots/benchmark-lin-attn-fwd-L8192.png
tk/demos/based_demo/plots/benchmark_input1000_output1.png
tk/demos/based_demo/plots/benchmark_input16000_output1.png
tk/demos/based_demo/plots/benchmark_input8000_output1.png
tk/demos/based_demo/train/src/__init__.py
tk/demos/based_demo/train/src/generation.py
tk/demos/based_demo/train/src/models/__init__.py
tk/demos/based_demo/train/src/models/block.py
tk/demos/based_demo/train/src/models/embeddings.py
tk/demos/based_demo/train/src/models/gpt.py
tk/demos/based_demo/train/src/models/mha.py
tk/demos/based_demo/train/src/models/mlp.py
tk/demos/based_demo/train/src/utils/hf.py
tk/demos/based_demo/train/src/utils/utils.py
tk/demos/configs/llama_3.1_8b_distill_config.yaml
tk/demos/configs/llama_3.1_8b_finetune_config.yaml
tk/demos/configs/llama_3.1_8b_model_config.yaml
tk/demos/llama_demo/demo_8b.sh
tk/demos/llama_demo/demo_llama_hf.py
tk/demos/llama_demo/src/__init__.py
tk/demos/llama_demo/src/model/__init__.py
tk/demos/llama_demo/src/model/pretrained.py
tk/demos/llama_demo/src/model/transformers_modeling_llama.py
tk/demos/llama_demo/src/model/transformers_modeling_utils.py
tk/demos/llama_demo/src/utils/__init__.py
tk/demos/llama_demo/src/utils/logging.py
tk/demos/llama_demo/src/utils/setup.py
tk/demos/lolcats_demo/demo_8b.sh
tk/demos/lolcats_demo/demo_lolcats_hf.py
tk/demos/lolcats_demo/src/__init__.py
tk/demos/lolcats_demo/src/__pycache__/__init__.cpython-311.pyc
tk/demos/lolcats_demo/src/model/__init__.py
tk/demos/lolcats_demo/src/model/convert_model.py
tk/demos/lolcats_demo/src/model/feature_map.py
tk/demos/lolcats_demo/src/model/load_model.py
tk/demos/lolcats_demo/src/model/modeling_llama.py
tk/demos/lolcats_demo/src/model/peft.py
tk/demos/lolcats_demo/src/model/pretrained.py
tk/demos/lolcats_demo/src/model/rotary.py
tk/demos/lolcats_demo/src/model/utils.py
tk/demos/lolcats_demo/src/model/linear_attention/__init__.py
tk/demos/lolcats_demo/src/model/linear_attention/linear_attention.py
tk/demos/lolcats_demo/src/model/linear_attention/linear_window_attention_tk.py
tk/demos/lolcats_demo/src/model/linear_attention/linear_window_attention_tk_gen.py
tk/demos/lolcats_demo/src/model/linear_attention/linear_window_attention_tk_long.py
tk/demos/lolcats_demo/src/model/linear_attention/utils.py
tk/demos/lolcats_demo/src/utils/__init__.py
tk/demos/lolcats_demo/src/utils/logging.py
tk/demos/lolcats_demo/src/utils/setup.py
tk/demos/qwen_demo/demo_8b.sh
tk/demos/qwen_demo/demo_qwen_hf.py
tk/demos/qwen_demo/src/__init__.py
tk/demos/qwen_demo/src/model/__init__.py
tk/demos/qwen_demo/src/model/pretrained.py
tk/demos/qwen_demo/src/model/transformers_modeling_qwen.py
tk/demos/qwen_demo/src/model/transformers_modeling_utils.py
tk/demos/qwen_demo/src/utils/__init__.py
tk/demos/qwen_demo/src/utils/logging.py
tk/demos/qwen_demo/src/utils/setup.py
tk/docs/conda_setup.md
tk/include/kittens.cuh
tk/include/common/base_ops.cuh
tk/include/common/base_types.cuh
tk/include/common/common.cuh
tk/include/common/debug.cuh
tk/include/common/util.cuh
tk/include/ops/ops.cuh
tk/include/ops/gang/gang.cuh
tk/include/ops/group/group.cuh
tk/include/ops/group/memory/memory.cuh
tk/include/ops/group/memory/tile/global_to_register.cuh
tk/include/ops/group/memory/tile/global_to_shared.cuh
tk/include/ops/group/memory/tile/pgl_to_register.cuh
tk/include/ops/group/memory/tile/pgl_to_shared.cuh
tk/include/ops/group/memory/tile/shared_to_register.cuh
tk/include/ops/group/memory/tile/tile.cuh
tk/include/ops/group/memory/tile/complex/complex_global_to_register.cuh
tk/include/ops/group/memory/tile/complex/complex_global_to_shared.cuh
tk/include/ops/group/memory/tile/complex/complex_shared_to_register.cuh
tk/include/ops/group/memory/util/util.cuh
tk/include/ops/group/memory/vec/global_to_register.cuh
tk/include/ops/group/memory/vec/global_to_shared.cuh
tk/include/ops/group/memory/vec/pgl_to_register.cuh
tk/include/ops/group/memory/vec/pgl_to_shared.cuh
tk/include/ops/group/memory/vec/shared_to_register.cuh
tk/include/ops/group/memory/vec/vec.cuh
tk/include/ops/group/register/register.cuh
tk/include/ops/group/register/tile/conversions.cuh
tk/include/ops/group/register/tile/tile.cuh
tk/include/ops/group/shared/shared.cuh
tk/include/ops/group/shared/tile/conversions.cuh
tk/include/ops/group/shared/tile/maps.cuh
tk/include/ops/group/shared/tile/reductions.cuh
tk/include/ops/group/shared/tile/tile.cuh
tk/include/ops/group/shared/vec/conversions.cuh
tk/include/ops/group/shared/vec/maps.cuh
tk/include/ops/group/shared/vec/vec.cuh
tk/include/ops/group/wgmma/wgmma.cuh
tk/include/ops/group/wgmma/base/64x112.impl
tk/include/ops/group/wgmma/base/64x128.impl
tk/include/ops/group/wgmma/base/64x144.impl
tk/include/ops/group/wgmma/base/64x16.impl
tk/include/ops/group/wgmma/base/64x160.impl
tk/include/ops/group/wgmma/base/64x176.impl
tk/include/ops/group/wgmma/base/64x192.impl
tk/include/ops/group/wgmma/base/64x208.impl
tk/include/ops/group/wgmma/base/64x224.impl
tk/include/ops/group/wgmma/base/64x240.impl
tk/include/ops/group/wgmma/base/64x256.impl
tk/include/ops/group/wgmma/base/64x32.impl
tk/include/ops/group/wgmma/base/64x48.impl
tk/include/ops/group/wgmma/base/64x64.impl
tk/include/ops/group/wgmma/base/64x80.impl
tk/include/ops/group/wgmma/base/64x96.impl
tk/include/ops/group/wgmma/base/base.cuh
tk/include/ops/warp/warp.cuh
tk/include/ops/warp/memory/memory.cuh
tk/include/ops/warp/memory/tile/global_to_register.cuh
tk/include/ops/warp/memory/tile/global_to_shared.cuh
tk/include/ops/warp/memory/tile/pgl_to_register.cuh
tk/include/ops/warp/memory/tile/pgl_to_shared.cuh
tk/include/ops/warp/memory/tile/shared_to_register.cuh
tk/include/ops/warp/memory/tile/tile.cuh
tk/include/ops/warp/memory/tile/tma.cuh
tk/include/ops/warp/memory/tile/complex/complex_global_to_register.cuh
tk/include/ops/warp/memory/tile/complex/complex_global_to_shared.cuh
tk/include/ops/warp/memory/tile/complex/complex_shared_to_register.cuh
tk/include/ops/warp/memory/util/reduce.cuh
tk/include/ops/warp/memory/util/tma.cuh
tk/include/ops/warp/memory/util/util.cuh
tk/include/ops/warp/memory/vec/global_to_register.cuh
tk/include/ops/warp/memory/vec/global_to_shared.cuh
tk/include/ops/warp/memory/vec/pgl_to_register.cuh
tk/include/ops/warp/memory/vec/pgl_to_shared.cuh
tk/include/ops/warp/memory/vec/shared_to_register.cuh
tk/include/ops/warp/memory/vec/tma.cuh
tk/include/ops/warp/memory/vec/vec.cuh
tk/include/ops/warp/register/register.cuh
tk/include/ops/warp/register/tile/conversions.cuh
tk/include/ops/warp/register/tile/maps.cuh
tk/include/ops/warp/register/tile/mma.cuh
tk/include/ops/warp/register/tile/reductions.cuh
tk/include/ops/warp/register/tile/tile.cuh
tk/include/ops/warp/register/tile/complex/complex_conversions.cuh
tk/include/ops/warp/register/tile/complex/complex_maps.cuh
tk/include/ops/warp/register/tile/complex/complex_mma.cuh
tk/include/ops/warp/register/vec/conversions.cuh
tk/include/ops/warp/register/vec/maps.cuh
tk/include/ops/warp/register/vec/reductions.cuh
tk/include/ops/warp/register/vec/vec.cuh
tk/include/ops/warp/shared/shared.cuh
tk/include/ops/warp/shared/tile/conversions.cuh
tk/include/ops/warp/shared/tile/maps.cuh
tk/include/ops/warp/shared/tile/reductions.cuh
tk/include/ops/warp/shared/tile/tile.cuh
tk/include/ops/warp/shared/vec/conversions.cuh
tk/include/ops/warp/shared/vec/maps.cuh
tk/include/ops/warp/shared/vec/reductions.cuh
tk/include/ops/warp/shared/vec/vec.cuh
tk/include/pyutils/club.cuh
tk/include/pyutils/pyutils.cuh
tk/include/pyutils/torch_helpers.cuh
tk/include/pyutils/util.cuh
tk/include/types/types.cuh
tk/include/types/device/device.cuh
tk/include/types/device/pgl.cuh
tk/include/types/device/sync_manager.cuh
tk/include/types/device/detail/helpers.cuh
tk/include/types/global/cgl.cuh
tk/include/types/global/gl.cuh
tk/include/types/global/global.cuh
tk/include/types/global/tma.cuh
tk/include/types/global/util.cuh
tk/include/types/register/crt.cuh
tk/include/types/register/crv.cuh
tk/include/types/register/register.cuh
tk/include/types/register/rt.cuh
tk/include/types/register/rt_base.cuh
tk/include/types/register/rt_layout.cuh
tk/include/types/register/rv.cuh
tk/include/types/register/rv_layout.cuh
tk/include/types/shared/cst.cuh
tk/include/types/shared/csv.cuh
tk/include/types/shared/shared.cuh
tk/include/types/shared/st.cuh
tk/include/types/shared/sv.cuh
tk/kernels/all_reduce/Makefile
tk/kernels/all_reduce/all_reduce.cu
tk/kernels/attn/demo/4090.cu
tk/kernels/attn/demo/4090_harness.impl
tk/kernels/attn/demo/Makefile
tk/kernels/attn/demo/README.md
tk/kernels/attn/demo/fa2_bench.py
tk/kernels/attn/demo/gentests.py
tk/kernels/attn/demo/h100_lcf.cu
tk/kernels/attn/demo/h100_lcf_harness.impl
tk/kernels/attn/demo/gqa_decode/Makefile
tk/kernels/attn/demo/gqa_decode/README.md
tk/kernels/attn/demo/gqa_decode/benchmark_fa3_gqa_decode.py
tk/kernels/attn/demo/gqa_decode/scheduler.py
tk/kernels/attn/demo/gqa_decode/scheduler_regression.py
tk/kernels/attn/demo/gqa_decode/scheduler_v2.py
tk/kernels/attn/demo/gqa_decode/template_gqa_decode.cu
tk/kernels/attn/demo/gqa_decode/template_gqa_decode_new.cu
tk/kernels/attn/demo/gqa_decode/test.py
tk/kernels/attn/demo/gqa_decode/timings.py
tk/kernels/attn/demo/mha_decode/Makefile
tk/kernels/attn/demo/mha_decode/bench.py
tk/kernels/attn/demo/mha_decode/benchmark_fa3_mha_decode.py
tk/kernels/attn/demo/mha_decode/scheduler.py
tk/kernels/attn/demo/mha_decode/scheduler_regression.py
tk/kernels/attn/demo/mha_decode/scheduler_v2.py
tk/kernels/attn/demo/mha_decode/template_mha_decode.cu
tk/kernels/attn/demo/mha_decode/test.py
tk/kernels/attn/demo/mha_decode/timings.py
tk/kernels/attn/demo/mla_decode/Makefile
tk/kernels/attn/demo/mla_decode/README.md
tk/kernels/attn/demo/mla_decode/scheduler.py
tk/kernels/attn/demo/mla_decode/scheduler_regression.py
tk/kernels/attn/demo/mla_decode/scheduler_v2.py
tk/kernels/attn/demo/mla_decode/template_mla_decode.cu
tk/kernels/attn/demo/mla_decode/test.py
tk/kernels/attn/demo/mla_decode/timings.py
tk/kernels/attn/h100/Makefile
tk/kernels/attn/h100/check.py
tk/kernels/attn/h100/gentests.py
tk/kernels/attn/h100/h100.cu
tk/kernels/attn/h100/h100_bench.py
tk/kernels/attn/h100/h100_check.py
tk/kernels/attn/h100/hammer_bwd.py
tk/kernels/attn/h100/hammer_fwd.py
tk/kernels/attn/h100/hammer_fwd_bwd.py
tk/kernels/attn/h100/harness.impl
tk/kernels/based/Makefile
tk/kernels/based/generate_tests.py
tk/kernels/based/harness_4090.impl
tk/kernels/based/harness_h100.impl
tk/kernels/based/lin_attn_4090.cu
tk/kernels/based/lin_attn_h100.cu
tk/kernels/based/lcsf/linear_prefill.cu
tk/kernels/example_bind/Makefile
tk/kernels/example_bind/README.md
tk/kernels/example_bind/example_bind.cu
tk/kernels/example_bind/run_from_python.py
tk/kernels/fftconv/long/pytorch_ref.py
tk/kernels/fftconv/non_pc/Makefile
tk/kernels/fftconv/non_pc/fftconv_tk.cu
tk/kernels/fftconv/non_pc/harness_async.impl
tk/kernels/fftconv/pc/Makefile
tk/kernels/fftconv/pc/generate_tests.py
tk/kernels/fftconv/pc/generate_tests_1024.py
tk/kernels/fftconv/pc/harness_async.impl
tk/kernels/fftconv/pc/pc.cu
tk/kernels/flux/Makefile
tk/kernels/flux/flux_gate.cu
tk/kernels/flux/flux_gelu.cu
tk/kernels/flux/test_flux.py
tk/kernels/hedgehog/Makefile
tk/kernels/hedgehog/README.md
tk/kernels/hedgehog/gentests.py
tk/kernels/hedgehog/harness.impl
tk/kernels/hedgehog/hh.cu
tk/kernels/layernorm/non_pc/Makefile
tk/kernels/layernorm/non_pc/gentests.py
tk/kernels/layernorm/non_pc/harness.impl
tk/kernels/layernorm/non_pc/layer_norm.cu
tk/kernels/mamba2/Makefile
tk/kernels/mamba2/gentests.py
tk/kernels/mamba2/harness.impl
tk/kernels/mamba2/harness2.impl
tk/kernels/mamba2/harness3.impl
tk/kernels/mamba2/pc.cu
tk/kernels/matmul/FP8/Makefile
tk/kernels/matmul/FP8/matmul.cu
tk/kernels/matmul/H100/Makefile
tk/kernels/matmul/H100/matmul.cu
tk/kernels/matmul/H100_mma_ABt/Makefile
tk/kernels/matmul/H100_mma_ABt/matmul.cu
tk/kernels/matmul/baselines/cublas/Makefile
tk/kernels/matmul/baselines/cublas/matmul.cu
tk/kernels/matmul/baselines/cublas_fp8/Makefile
tk/kernels/matmul/baselines/cublas_fp8/matmul.cu
tk/kernels/matmul/baselines/cublas_lt/Makefile
tk/kernels/matmul/baselines/cublas_lt/matmul.cu
tk/kernels/matmul/educational/Makefile
tk/kernels/matmul/educational/README.md
tk/kernels/matmul/educational/launch.cu
tk/kernels/matmul/educational/level_01.cu
tk/kernels/matmul/educational/level_02.cu
tk/kernels/matmul/educational/level_03.cu
tk/kernels/matmul/educational/level_04.cu
tk/kernels/matmul/educational/level_05.cu
tk/kernels/matmul/educational/level_06.cu
tk/kernels/matmul/educational/level_07.cu
tk/kernels/matmul/educational/level_08.cu
tk/kernels/ring_attention/Makefile
tk/kernels/ring_attention/setup.py
tk/kernels/ring_attention/tk_ring_attention.cu
tk/kernels/rotary/Makefile
tk/kernels/rotary/gentests.py
tk/kernels/rotary/harness.impl
tk/kernels/rotary/harness2.impl
tk/kernels/rotary/pc.cu
tk/kernels/torch_scaled/Makefile
tk/kernels/torch_scaled/gentests.py
tk/kernels/torch_scaled/harness.impl
tk/kernels/torch_scaled/scaled_matmul.cu
tk/kernels/torch_scaled/visualize.py
tk/prototype/prototype.cuh
tk/prototype/common/common.cuh
tk/prototype/common/templates.cuh
tk/prototype/common/util.cuh
tk/prototype/interpreter/Makefile
tk/prototype/interpreter/interpreter.cuh
tk/prototype/interpreter/templates.cuh
tk/prototype/interpreter/test.cu
tk/prototype/lcf/lcf.cuh
tk/prototype/lcf/templates.cuh
tk/prototype/lcsf/lcsf.cuh
tk/prototype/lcsf/templates.cuh
tk/tests/python/README.md
tk/tests/python/benchmark_kernels.py
tk/tests/python/utils.py
tk/tests/python/attention/implementations.py
tk/tests/python/attention/test_correctness.py
tk/tests/python/based_attention/implementations.py
tk/tests/python/based_attention/test_correctness.py
tk/tests/python/fftconv/implementations.py
tk/tests/python/fftconv/test_correctness.py
tk/tests/python/fftconv/baselines/tk_fftconv.py
tk/tests/python/hedgehog/implementations.py
tk/tests/python/hedgehog/test_correctness.py
tk/tests/python/hedgehog/util.py
tk/tests/python/layernorm/benchmark.py
tk/tests/python/layernorm/implementations.py
tk/tests/python/layernorm/test_correctness.py
tk/tests/python/layernorm/baselines/layer_norm_triton.py
tk/tests/python/mamba2/implementations.py
tk/tests/python/mamba2/test_correctness.py
tk/tests/python/mamba2/baselines/ssd_minimal.py
tk/tests/python/matmul/test_fp8.py
tk/tests/python/ring_attention/benchmark.py
tk/tests/python/ring_attention/implementations.py
tk/tests/python/ring_attention/test_correctness.py
tk/tests/python/ring_attention/original_ring_attention/__init__.py
tk/tests/python/ring_attention/original_ring_attention/ringattention_inference.py
tk/tests/python/ring_attention/original_ring_attention/ringattention_jax.py
tk/tests/python/ring_attention/original_ring_attention/ringattention_pallas_gpu.py
tk/tests/python/ring_attention/original_ring_attention/ringattention_pallas_tpu.py
tk/tests/python/rotary/implementations.py
tk/tests/python/rotary/test_correctness.py
tk/tests/python/rotary/baselines/rotary.py
tk/tests/python/rotary/baselines/triton_rotary.py
tk/tests/unit/Makefile
tk/tests/unit/README.md
tk/tests/unit/unit_tests.cu
tk/tests/unit/gang/gang.cu
tk/tests/unit/gang/gang.cuh
tk/tests/unit/group/group.cu
tk/tests/unit/group/group.cuh
tk/tests/unit/group/memory/memory.cu
tk/tests/unit/group/memory/memory.cuh
tk/tests/unit/group/memory/tile/global_to_register.cu
tk/tests/unit/group/memory/tile/global_to_register.cuh
tk/tests/unit/group/memory/tile/global_to_shared.cu
tk/tests/unit/group/memory/tile/global_to_shared.cuh
tk/tests/unit/group/memory/tile/pgl_to_register.cu
tk/tests/unit/group/memory/tile/pgl_to_register.cuh
tk/tests/unit/group/memory/tile/pgl_to_shared.cu
tk/tests/unit/group/memory/tile/pgl_to_shared.cuh
tk/tests/unit/group/memory/tile/shared_to_register.cu
tk/tests/unit/group/memory/tile/shared_to_register.cuh
tk/tests/unit/group/memory/tile/tile.cu
tk/tests/unit/group/memory/tile/tile.cuh
tk/tests/unit/group/memory/vec/global_to_register.cu
tk/tests/unit/group/memory/vec/global_to_register.cuh
tk/tests/unit/group/memory/vec/global_to_shared.cu
tk/tests/unit/group/memory/vec/global_to_shared.cuh
tk/tests/unit/group/memory/vec/pgl_to_register.cu
tk/tests/unit/group/memory/vec/pgl_to_register.cuh
tk/tests/unit/group/memory/vec/pgl_to_shared.cu
tk/tests/unit/group/memory/vec/pgl_to_shared.cuh
tk/tests/unit/group/memory/vec/shared_to_register.cu
tk/tests/unit/group/memory/vec/shared_to_register.cuh
tk/tests/unit/group/memory/vec/vec.cu
tk/tests/unit/group/memory/vec/vec.cuh
tk/tests/unit/group/shared/shared.cu
tk/tests/unit/group/shared/shared.cuh
tk/tests/unit/group/shared/tile/conversions.cu
tk/tests/unit/group/shared/tile/conversions.cuh
tk/tests/unit/group/shared/tile/maps.cu
tk/tests/unit/group/shared/tile/maps.cuh
tk/tests/unit/group/shared/tile/reductions.cu
tk/tests/unit/group/shared/tile/reductions.cuh
tk/tests/unit/group/shared/tile/tile.cu
tk/tests/unit/group/shared/tile/tile.cuh
tk/tests/unit/group/shared/vec/conversions.cu
tk/tests/unit/group/shared/vec/conversions.cuh
tk/tests/unit/group/shared/vec/maps.cu
tk/tests/unit/group/shared/vec/maps.cuh
tk/tests/unit/group/shared/vec/vec.cu
tk/tests/unit/group/shared/vec/vec.cuh
tk/tests/unit/group/wgmma/mma_fp16_fp16.cu
tk/tests/unit/group/wgmma/mma_fp16_fp16.cuh
tk/tests/unit/group/wgmma/mma_fp16_fp8.cu
tk/tests/unit/group/wgmma/mma_fp16_fp8.cuh
tk/tests/unit/group/wgmma/mma_fp32_bf16.cu
tk/tests/unit/group/wgmma/mma_fp32_bf16.cuh
tk/tests/unit/group/wgmma/mma_fp32_fp16.cu
tk/tests/unit/group/wgmma/mma_fp32_fp16.cuh
tk/tests/unit/group/wgmma/mma_fp32_fp8.cu
tk/tests/unit/group/wgmma/mma_fp32_fp8.cuh
tk/tests/unit/group/wgmma/wgmma.cu
tk/tests/unit/group/wgmma/wgmma.cuh
tk/tests/unit/group/wgmma/complex/complex_mma_fp16_fp16.cu
tk/tests/unit/group/wgmma/complex/complex_mma_fp16_fp16.cuh
tk/tests/unit/group/wgmma/complex/complex_mma_fp32_bf16.cu
tk/tests/unit/group/wgmma/complex/complex_mma_fp32_bf16.cuh
tk/tests/unit/group/wgmma/complex/complex_mma_fp32_fp16.cu
tk/tests/unit/group/wgmma/complex/complex_mma_fp32_fp16.cuh
tk/tests/unit/group/wgmma/complex/complex_wgmma.cu
tk/tests/unit/group/wgmma/complex/complex_wgmma.cuh
tk/tests/unit/testing_commons/testing_commons.cuh
tk/tests/unit/testing_commons/testing_flags.cuh
tk/tests/unit/testing_commons/testing_utils.cu
tk/tests/unit/testing_commons/testing_utils.cuh
tk/tests/unit/warp/warp.cu
tk/tests/unit/warp/warp.cuh
tk/tests/unit/warp/memory/memory.cu
tk/tests/unit/warp/memory/memory.cuh
tk/tests/unit/warp/memory/tile/dsmem.cu
tk/tests/unit/warp/memory/tile/dsmem.cuh
tk/tests/unit/warp/memory/tile/global_to_register.cu
tk/tests/unit/warp/memory/tile/global_to_register.cuh
tk/tests/unit/warp/memory/tile/global_to_shared.cu
tk/tests/unit/warp/memory/tile/global_to_shared.cuh
tk/tests/unit/warp/memory/tile/pgl_to_register.cu
tk/tests/unit/warp/memory/tile/pgl_to_register.cuh
tk/tests/unit/warp/memory/tile/pgl_to_shared.cu
tk/tests/unit/warp/memory/tile/pgl_to_shared.cuh
tk/tests/unit/warp/memory/tile/shared_to_register.cu
tk/tests/unit/warp/memory/tile/shared_to_register.cuh
tk/tests/unit/warp/memory/tile/tile.cu
tk/tests/unit/warp/memory/tile/tile.cuh
tk/tests/unit/warp/memory/tile/tma.cu
tk/tests/unit/warp/memory/tile/tma.cuh
tk/tests/unit/warp/memory/tile/tma_multicast.cu
tk/tests/unit/warp/memory/tile/tma_multicast.cuh
tk/tests/unit/warp/memory/tile/tma_pgl.cu
tk/tests/unit/warp/memory/tile/tma_pgl.cuh
tk/tests/unit/warp/memory/util/reduce.cu
tk/tests/unit/warp/memory/util/reduce.cuh
tk/tests/unit/warp/memory/util/util.cu
tk/tests/unit/warp/memory/util/util.cuh
tk/tests/unit/warp/memory/vec/dsmem.cu
tk/tests/unit/warp/memory/vec/dsmem.cuh
tk/tests/unit/warp/memory/vec/global_to_register.cu
tk/tests/unit/warp/memory/vec/global_to_register.cuh
tk/tests/unit/warp/memory/vec/global_to_shared.cu
tk/tests/unit/warp/memory/vec/global_to_shared.cuh
tk/tests/unit/warp/memory/vec/pgl_to_register.cu
tk/tests/unit/warp/memory/vec/pgl_to_register.cuh
tk/tests/unit/warp/memory/vec/pgl_to_shared.cu
tk/tests/unit/warp/memory/vec/pgl_to_shared.cuh
tk/tests/unit/warp/memory/vec/shared_to_register.cu
tk/tests/unit/warp/memory/vec/shared_to_register.cuh
tk/tests/unit/warp/memory/vec/tma.cu
tk/tests/unit/warp/memory/vec/tma.cuh
tk/tests/unit/warp/memory/vec/tma_multicast.cu
tk/tests/unit/warp/memory/vec/tma_multicast.cuh
tk/tests/unit/warp/memory/vec/tma_pgl.cu
tk/tests/unit/warp/memory/vec/tma_pgl.cuh
tk/tests/unit/warp/memory/vec/vec.cu
tk/tests/unit/warp/memory/vec/vec.cuh
tk/tests/unit/warp/register/register.cu
tk/tests/unit/warp/register/register.cuh
tk/tests/unit/warp/register/tile/conversions.cu
tk/tests/unit/warp/register/tile/conversions.cuh
tk/tests/unit/warp/register/tile/maps.cu
tk/tests/unit/warp/register/tile/maps.cuh
tk/tests/unit/warp/register/tile/mma.cu
tk/tests/unit/warp/register/tile/mma.cuh
tk/tests/unit/warp/register/tile/reductions.cu
tk/tests/unit/warp/register/tile/reductions.cuh
tk/tests/unit/warp/register/tile/tile.cu
tk/tests/unit/warp/register/tile/tile.cuh
tk/tests/unit/warp/register/tile/complex/complex_conversions.cu
tk/tests/unit/warp/register/tile/complex/complex_conversions.cuh
tk/tests/unit/warp/register/tile/complex/complex_maps.cu
tk/tests/unit/warp/register/tile/complex/complex_maps.cuh
tk/tests/unit/warp/register/tile/complex/complex_mma.cu
tk/tests/unit/warp/register/tile/complex/complex_mma.cuh
tk/tests/unit/warp/register/tile/complex/complex_mul.cu
tk/tests/unit/warp/register/tile/complex/complex_mul.cuh
tk/tests/unit/warp/register/vec/conversions.cu
tk/tests/unit/warp/register/vec/conversions.cuh
tk/tests/unit/warp/register/vec/maps.cu
tk/tests/unit/warp/register/vec/maps.cuh
tk/tests/unit/warp/register/vec/reductions.cu
tk/tests/unit/warp/register/vec/reductions.cuh
tk/tests/unit/warp/register/vec/vec.cu
tk/tests/unit/warp/register/vec/vec.cuh
tk/tests/unit/warp/shared/shared.cu
tk/tests/unit/warp/shared/shared.cuh
tk/tests/unit/warp/shared/tile/conversions.cu
tk/tests/unit/warp/shared/tile/conversions.cuh
tk/tests/unit/warp/shared/tile/maps.cu
tk/tests/unit/warp/shared/tile/maps.cuh
tk/tests/unit/warp/shared/tile/reductions.cu
tk/tests/unit/warp/shared/tile/reductions.cuh
tk/tests/unit/warp/shared/tile/tile.cu
tk/tests/unit/warp/shared/tile/tile.cuh
tk/tests/unit/warp/shared/vec/conversions.cu
tk/tests/unit/warp/shared/vec/conversions.cuh
tk/tests/unit/warp/shared/vec/maps.cu
tk/tests/unit/warp/shared/vec/maps.cuh
tk/tests/unit/warp/shared/vec/reductions.cu
tk/tests/unit/warp/shared/vec/reductions.cuh
tk/tests/unit/warp/shared/vec/vec.cu
tk/tests/unit/warp/shared/vec/vec.cuh
vsa/__init__.py
vsa/block_sparse_attn_triton.py
vsa/block_sparse_h100.cu
vsa/block_sparse_wrapper.py
vsa/index.py
vsa.egg-info/PKG-INFO
vsa.egg-info/SOURCES.txt
vsa.egg-info/dependency_links.txt
vsa.egg-info/requires.txt
vsa.egg-info/top_level.txt