AUTHORS
LICENSE
MANIFEST.in
README.md
setup.py
/home/liaojy/workspace/FA/smh/flash-attention-npu-smh/csrc/flash_attn_npu/flash_api.cpp
/home/liaojy/workspace/FA/smh/flash-attention-npu-smh/csrc/flash_attn_npu_v3/flash_api.cpp
csrc/catlass/include/catlass/catlass.hpp
csrc/catlass/include/catlass/conv_coord.hpp
csrc/catlass/include/catlass/coord.hpp
csrc/catlass/include/catlass/debug.hpp
csrc/catlass/include/catlass/gemm_coord.hpp
csrc/catlass/include/catlass/gemv_coord.hpp
csrc/catlass/include/catlass/matrix_coord.hpp
csrc/catlass/include/catlass/numeric_size.hpp
csrc/catlass/include/catlass/status.hpp
csrc/catlass/include/catlass/arch/arch.hpp
csrc/catlass/include/catlass/arch/cross_core_sync.hpp
csrc/catlass/include/catlass/arch/local_tensor_buffer.hpp
csrc/catlass/include/catlass/arch/resource.hpp
csrc/catlass/include/catlass/conv/dispatch_policy.hpp
csrc/catlass/include/catlass/conv/block/block_conv.hpp
csrc/catlass/include/catlass/conv/block/block_conv2d_pingpong.hpp
csrc/catlass/include/catlass/conv/block/block_conv3d_pingpong_bias.hpp
csrc/catlass/include/catlass/conv/block/block_swizzle.hpp
csrc/catlass/include/catlass/conv/device/device_conv.hpp
csrc/catlass/include/catlass/conv/kernel/basic_conv2d.hpp
csrc/catlass/include/catlass/conv/kernel/conv3d_bias.hpp
csrc/catlass/include/catlass/conv/tile/copy_gm_to_l1.hpp
csrc/catlass/include/catlass/conv/tile/copy_l0c_to_gm.hpp
csrc/catlass/include/catlass/conv/tile/copy_l1_to_l0a.hpp
csrc/catlass/include/catlass/conv/tile/copy_l1_to_l0b.hpp
csrc/catlass/include/catlass/conv/tile/tile_copy.hpp
csrc/catlass/include/catlass/detail/alignment.hpp
csrc/catlass/include/catlass/detail/callback.hpp
csrc/catlass/include/catlass/detail/dependent_false.hpp
csrc/catlass/include/catlass/detail/kernel_adapter.hpp
csrc/catlass/include/catlass/detail/macros.hpp
csrc/catlass/include/catlass/epilogue/dispatch_policy.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_elemwise_no_source.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_elemwise_one_source.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_fa_rescale_o.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_fa_softmax.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_gemm.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_gemv.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_mla_fd_rescale_o.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_mla_rescale_o.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_mla_softmax.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_mla_tp1_rescale_o.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_mla_tp1_softmax.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_online_softmax_no_mask.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_per_token_dequant.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_rescale_o_no_split_row.hpp
csrc/catlass/include/catlass/epilogue/block/block_epilogue_w4a4_per_token_per_channel_dequant.hpp
csrc/catlass/include/catlass/epilogue/tile/copy_gm_to_ub.hpp
csrc/catlass/include/catlass/epilogue/tile/copy_ub_to_gm.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_broadcast_add.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_broadcast_inplace_by_column.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_broadcast_inplace_by_row.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_broadcast_mul.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_broadcast_one_blk.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_cast.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_copy.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_elemwise_add.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_elemwise_gelu.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_elemwise_mul.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_elemwise_muls.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_elemwise_silu.hpp
csrc/catlass/include/catlass/epilogue/tile/tile_swizzle.hpp
csrc/catlass/include/catlass/gemm/dispatch_policy.hpp
csrc/catlass/include/catlass/gemm/gemm_type.hpp
csrc/catlass/include/catlass/gemm/helper.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_dynamic_aiv.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_dynamic_common.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_dynamic_single_core_splitk.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_dynamic_small.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_dynamic_streamk.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_fa_pv.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_fa_qk.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_fai_pv_head_tail.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_fai_pv_normal.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_fai_qk_head_tail.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_fai_qk_normal.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_gemm.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_mla_pv.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_mla_pv_tp1_spec.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_mla_qk.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_mla_qk_tp1_spec.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_pingpong.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_pingpong_bias.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_pingpong_full_loadA.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_pingpong_slice_k_with_prologue.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_pingpong_with_prologue.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_preload.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_preload_async.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_preload_async_with_callback.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_single_core_splitk.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_small.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_streamk.hpp
csrc/catlass/include/catlass/gemm/block/block_mmad_w4a4_per_token_per_channel_dequant.hpp
csrc/catlass/include/catlass/gemm/block/block_scheduler_iterateK.hpp
csrc/catlass/include/catlass/gemm/block/block_scheduler_l2_misplace_core.hpp
csrc/catlass/include/catlass/gemm/block/block_swizzle.hpp
csrc/catlass/include/catlass/gemm/device/device_gemm.hpp
csrc/catlass/include/catlass/gemm/kernel/basic_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/basic_matmul_preload.hpp
csrc/catlass/include/catlass/gemm/kernel/batched_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_aiv_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_common_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_local_padding_c_padding_common_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_padding_common_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_padding_multi_core_splitk_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_padding_streamk_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_single_core_splitk_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/dynamic_small_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/fp8_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/gemm.hpp
csrc/catlass/include/catlass/gemm/kernel/group_gemm.hpp
csrc/catlass/include/catlass/gemm/kernel/grouped_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/grouped_matmul_slice_k.hpp
csrc/catlass/include/catlass/gemm/kernel/grouped_matmul_slice_k_per_token_dequant.hpp
csrc/catlass/include/catlass/gemm/kernel/grouped_matmul_slice_m.hpp
csrc/catlass/include/catlass/gemm/kernel/grouped_matmul_slice_m_per_token_dequant.hpp
csrc/catlass/include/catlass/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_multistage_workspace.hpp
csrc/catlass/include/catlass/gemm/kernel/matmul_activation.hpp
csrc/catlass/include/catlass/gemm/kernel/matmul_bias.hpp
csrc/catlass/include/catlass/gemm/kernel/matmul_epilogue.hpp
csrc/catlass/include/catlass/gemm/kernel/matmul_full_loadA.hpp
csrc/catlass/include/catlass/gemm/kernel/optimized_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/padding_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/padding_splitk_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/quant_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/quant_matmul_multistage_workspace.hpp
csrc/catlass/include/catlass/gemm/kernel/single_core_slicek_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/small_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/splitk_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/streamk_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/w4a4_matmul_per_token_per_channel_dequant.hpp
csrc/catlass/include/catlass/gemm/kernel/w4a8_matmul.hpp
csrc/catlass/include/catlass/gemm/kernel/w8a16_matmul.hpp
csrc/catlass/include/catlass/gemm/tile/cast_fp8_to_fp16.hpp
csrc/catlass/include/catlass/gemm/tile/cast_int4_to_int8.hpp
csrc/catlass/include/catlass/gemm/tile/cast_int8_to_fp16.hpp
csrc/catlass/include/catlass/gemm/tile/copy_gm_to_l1.hpp
csrc/catlass/include/catlass/gemm/tile/copy_gm_to_ub.hpp
csrc/catlass/include/catlass/gemm/tile/copy_l0c_to_gm.hpp
csrc/catlass/include/catlass/gemm/tile/copy_l1_to_bt.hpp
csrc/catlass/include/catlass/gemm/tile/copy_l1_to_fp.hpp
csrc/catlass/include/catlass/gemm/tile/copy_l1_to_l0a.hpp
csrc/catlass/include/catlass/gemm/tile/copy_l1_to_l0b.hpp
csrc/catlass/include/catlass/gemm/tile/copy_ub_to_gm.hpp
csrc/catlass/include/catlass/gemm/tile/tile_copy.hpp
csrc/catlass/include/catlass/gemm/tile/tile_mmad.hpp
csrc/catlass/include/catlass/gemm/tile/tile_muls.hpp
csrc/catlass/include/catlass/gemm/tile/tile_traits.hpp
csrc/catlass/include/catlass/gemv/helper.hpp
csrc/catlass/include/catlass/gemv/block/block_gemv.hpp
csrc/catlass/include/catlass/gemv/block/block_gemv_aic.hpp
csrc/catlass/include/catlass/gemv/block/block_gemv_aiv.hpp
csrc/catlass/include/catlass/gemv/device/device_gemv.hpp
csrc/catlass/include/catlass/gemv/kernel/kernel_gemv_aic.hpp
csrc/catlass/include/catlass/gemv/kernel/kernel_gemv_aiv.hpp
csrc/catlass/include/catlass/gemv/tile/matrix_copy_gm_to_ub.hpp
csrc/catlass/include/catlass/gemv/tile/tile_copy.hpp
csrc/catlass/include/catlass/gemv/tile/tile_vmad.hpp
csrc/catlass/include/catlass/gemv/tile/tile_vmuls.hpp
csrc/catlass/include/catlass/gemv/tile/vec_copy_gm_to_ub.hpp
csrc/catlass/include/catlass/gemv/tile/vec_copy_ub_to_gm.hpp
csrc/catlass/include/catlass/layout/layout.hpp
csrc/catlass/include/catlass/layout/matrix.hpp
csrc/catlass/include/catlass/layout/tensor.hpp
csrc/catlass/include/catlass/layout/vector.hpp
csrc/catlass/tools/library/include/catlass/library/manifest.h
csrc/catlass/tools/library/include/catlass/library/operation.h
csrc/catlass/tools/library/src/gemm_operation.h
csrc/catlass/tools/library/src/library_utils.h
csrc/catlass/tools/library/src/manifest.cpp
csrc/catlass/tools/tuner/dfx_kernel/clear_l2_cache.cpp
csrc/catlass/tools/tuner/include/catlass_tuner.h
csrc/catlass/tools/tuner/include/command_line_parser.h
csrc/catlass/tools/tuner/include/device_memory_manager.h
csrc/catlass/tools/tuner/include/gemm_op_config.h
csrc/catlass/tools/tuner/include/library_helper.h
csrc/catlass/tools/tuner/include/log.h
csrc/catlass/tools/tuner/include/m_t_var.h
csrc/catlass/tools/tuner/include/metric.h
csrc/catlass/tools/tuner/include/metrics.h
csrc/catlass/tools/tuner/include/op_config.h
csrc/catlass/tools/tuner/include/op_launcher.h
csrc/catlass/tools/tuner/include/profiler.h
csrc/catlass/tools/tuner/src/catlass_tuner.cpp
csrc/catlass/tools/tuner/src/command_line_parser.cpp
csrc/catlass/tools/tuner/src/device_memory_manager.cpp
csrc/catlass/tools/tuner/src/gemm_op_config.cpp
csrc/catlass/tools/tuner/src/library_helper.cpp
csrc/catlass/tools/tuner/src/main.cpp
csrc/catlass/tools/tuner/src/metric.cpp
csrc/catlass/tools/tuner/src/metrics.cpp
csrc/catlass/tools/tuner/src/op_config.cpp
csrc/catlass/tools/tuner/src/op_launcher.cpp
csrc/catlass/tools/tuner/src/profiler.cpp
csrc/flash_attn_npu/fa_block.h
csrc/flash_attn_npu/fag_block.h
csrc/flash_attn_npu/fag_epilogue_op.hpp
csrc/flash_attn_npu/fag_epilogue_post.hpp
csrc/flash_attn_npu/fag_epilogue_pre.hpp
csrc/flash_attn_npu/fag_epilogue_sfmg.hpp
csrc/flash_attn_npu/fag_mmad_cube1.hpp
csrc/flash_attn_npu/fag_mmad_cube2.hpp
csrc/flash_attn_npu/fag_mmad_cube3.hpp
csrc/flash_attn_npu/fag_sfmg.h
csrc/flash_attn_npu/fag_tiling.cpp
csrc/flash_attn_npu/flash_api.cpp
csrc/flash_attn_npu/init_outputs.hpp
csrc/flash_attn_npu/kernel_common.hpp
csrc/flash_attn_npu/mha_fwd_kvcache.cpp
csrc/flash_attn_npu/mha_varlen_bwd.cpp
csrc/flash_attn_npu/online_softmax.hpp
csrc/flash_attn_npu/online_softmax_low_prec.hpp
csrc/flash_attn_npu/pv_matmul.hpp
csrc/flash_attn_npu/qk_matmul.hpp
csrc/flash_attn_npu/rescale_o.hpp
csrc/flash_attn_npu/rescale_o_low_prec.hpp
csrc/flash_attn_npu/softmax_tiling.cpp
csrc/flash_attn_npu/tilingdata.h
csrc/flash_attn_npu/fag_common/common_header.h
csrc/flash_attn_npu/fag_common/cube_addr.h
csrc/flash_attn_npu/fag_common/vector_addr.h
csrc/flash_attn_npu_v3/fa_block.h
csrc/flash_attn_npu_v3/flash_api.cpp
csrc/flash_attn_npu_v3/kernel_common.hpp
csrc/flash_attn_npu_v3/mha_fwd_kvcache.cpp
csrc/flash_attn_npu_v3/online_softmax.hpp
csrc/flash_attn_npu_v3/online_softmax_low_prec.hpp
csrc/flash_attn_npu_v3/pv_matmul.hpp
csrc/flash_attn_npu_v3/qk_matmul.hpp
csrc/flash_attn_npu_v3/rescale_o.hpp
csrc/flash_attn_npu_v3/rescale_o_low_prec.hpp
csrc/flash_attn_npu_v3/tilingdata.h
flash_attn_npu/__init__.py
flash_attn_npu/flash_attn_interface.py
flash_attn_npu.egg-info/PKG-INFO
flash_attn_npu.egg-info/SOURCES.txt
flash_attn_npu.egg-info/dependency_links.txt
flash_attn_npu.egg-info/requires.txt
flash_attn_npu.egg-info/top_level.txt
flash_attn_npu_v3/__init__.py
flash_attn_npu_v3/flash_attn_interface.py
tests/test_flash_attn_npu.py
tests/test_flash_attn_npu_bwd.py
tests/test_flash_attn_npu_v3.py