.clang-format
.gitignore
LICENSE
README.md
pyproject.toml
ruff.toml
benchmarks/bench_cutlass_w8a8.py
benchmarks/bench_humming.py
benchmarks/bench_marlin.py
benchmarks/bench_torch_w16a16.py
benchmarks/bench_triton_moe.py
benchmarks/results/4090/dense_n8192_k8192_g0/cutlass_w8a8_float8e4m3_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w1a16_int1_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w1a8_int1_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w2a16_int2_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w2a8_int2_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w3a16_int3_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w3a8_int3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w4a16_int4_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w4a8_float4e2m1_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w5a16_int5_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w5a8_float5e2m2_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w6a16_int6_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w6a8_float6e2m3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w7a16_int7_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w7a8_float7e3m3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w8a16_int8_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/humming_w8a8_float8e4m3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0/marlin_w4a16_int4_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/marlin_w4a8_int4_int8.json
benchmarks/results/4090/dense_n8192_k8192_g0/marlin_w8a16_float8e4m3_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0/torch_w16a16_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w1a16_int1_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w1a8_int1_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w2a16_int2_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w2a8_int2_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w3a16_int3_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w3a8_int3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w4a16_int4_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w4a8_float4e2m1_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w5a16_int5_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w5a8_float5e2m2_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w6a16_int6_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w6a8_float6e2m3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w7a16_int7_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w7a8_float7e3m3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w8a16_int8_float16.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/humming_w8a8_float8e4m3_float8e4m3.json
benchmarks/results/4090/dense_n8192_k8192_g0_f16accum/torch_w16a16_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/cutlass_w8a8_int8_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w1a16_int1_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w1a4_int1_int4.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w1a8_int1_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w2a16_int2_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w2a4_int2_int4.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w2a8_int2_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w3a16_int3_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w3a4_int3_int4.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w3a8_int3_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w4a16_int4_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w4a4_int4_int4.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w4a8_int4_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w5a16_int5_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w5a8_int5_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w6a16_int6_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w6a8_float6e3m2_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w7a16_int7_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w7a8_float7e4m2_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w8a16_int8_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/humming_w8a8_int8_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/marlin_w4a16_int4_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/marlin_w4a8_int4_int8.json
benchmarks/results/a800/dense_n8192_k8192_g0/marlin_w8a16_int8_float16.json
benchmarks/results/a800/dense_n8192_k8192_g0/torch_w16a16_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/cutlass_w8a8_float8e4m3_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w1a16_int1_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w1a8_int1_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w2a16_int2_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w2a8_int2_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w3a16_int3_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w3a8_int3_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w4a16_int4_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w4a8_int4_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w5a16_int5_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w5a8_int5_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w6a16_int6_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w6a8_float6e3m2_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w7a16_int7_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w7a8_float7e4m2_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w8a16_int8_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/humming_w8a8_float8e4m3_float8e4m3.json
benchmarks/results/h20/dense_n8192_k8192_g0/marlin_w4a16_int4_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/marlin_w4a8_int4_int8.json
benchmarks/results/h20/dense_n8192_k8192_g0/marlin_w8a16_float8e4m3_float16.json
benchmarks/results/h20/dense_n8192_k8192_g0/torch_w16a16_float16.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w1a8_int1_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w2a8_int2_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w3a8_int3_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w4a8_int4_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w5a8_int5_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w6a8_float6e3m2_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w7a8_float7e4m2_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/humming_w8a8_float8e4m3_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n512_k7168_g128x128/triton_w8a8_float8e4m3_float16.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w1a8_int1_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w2a8_int2_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w3a8_int3_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w4a8_int4_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w5a8_int5_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w6a8_float6e3m2_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w7a8_float7e4m2_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/humming_w8a8_float8e4m3_float8e4m3.json
benchmarks/results/h20/moe_e256_k8_n7168_k256_g128x128/triton_w8a8_float8e4m3_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/cutlass_w8a8_float8e4m3_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w1a16_int1_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w1a8_int1_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w2a16_int2_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w2a8_int2_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w3a16_int3_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w3a8_int3_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w4a16_int4_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w4a8_float4e2m1_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w5a16_int5_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w5a8_float5e2m2_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w6a16_int6_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w6a8_float6e2m3_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w7a16_int7_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w7a8_float7e3m3_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w8a16_int8_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/humming_w8a8_float8e4m3_float8e4m3.json
benchmarks/results/h200/dense_n8192_k8192_g0/marlin_w4a16_int4_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/marlin_w4a8_int4_int8.json
benchmarks/results/h200/dense_n8192_k8192_g0/marlin_w8a16_int8_float16.json
benchmarks/results/h200/dense_n8192_k8192_g0/torch_w16a16_float16.json
docs/config.md
humming/__init__.py
humming/dtypes.py
humming/layer.py
humming/config/__init__.py
humming/config/base.py
humming/config/config.py
humming/config/enum.py
humming/config/mma.py
humming/csrc/launcher/elf.h
humming/csrc/launcher/launcher.cpp
humming/csrc/launcher/tensor.h
humming/csrc/launcher/tma.h
humming/csrc/launcher/torch_api.h
humming/csrc/launcher/utils.h
humming/include/humming/scheduler.cuh
humming/include/humming/arith/epilogue_arith.cuh
humming/include/humming/arith/exp_offset.cuh
humming/include/humming/arith/mainloop_arith.cuh
humming/include/humming/datatype/base_conversion.cuh
humming/include/humming/datatype/dequant.cuh
humming/include/humming/datatype/dequant_fused.cuh
humming/include/humming/datatype/dequant_prepare.cuh
humming/include/humming/datatype/dequant_single.cuh
humming/include/humming/datatype/dtypes.cuh
humming/include/humming/epilogue/gmem_writer.cuh
humming/include/humming/epilogue/pipeline.cuh
humming/include/humming/epilogue/smem_reducer.cuh
humming/include/humming/epilogue/smem_writer.cuh
humming/include/humming/kernel/dequant_weight.cuh
humming/include/humming/kernel/humming.cuh
humming/include/humming/kernel/humming_ws.cuh
humming/include/humming/kernel/pack_weight.cuh
humming/include/humming/kernel/process.cuh
humming/include/humming/kernel/process_mxfp4.cuh
humming/include/humming/kernel/quant_weight.cuh
humming/include/humming/kernel/tops_bench.cuh
humming/include/humming/memory/g2s_pipeline.cuh
humming/include/humming/memory/s2r_pipeline.cuh
humming/include/humming/memory/g2s_loader/loader_a.cuh
humming/include/humming/memory/g2s_loader/loader_as.cuh
humming/include/humming/memory/g2s_loader/loader_b.cuh
humming/include/humming/memory/g2s_loader/loader_bias.cuh
humming/include/humming/memory/g2s_loader/loader_bs.cuh
humming/include/humming/memory/g2s_loader/loader_bzp.cuh
humming/include/humming/memory/s2r_loader/loader_a.cuh
humming/include/humming/memory/s2r_loader/loader_as.cuh
humming/include/humming/memory/s2r_loader/loader_b.cuh
humming/include/humming/memory/s2r_loader/loader_bias.cuh
humming/include/humming/memory/s2r_loader/loader_bs.cuh
humming/include/humming/memory/s2r_loader/loader_bzp.cuh
humming/include/humming/mma/wgmma.cuh
humming/include/humming/mma/wmma.cuh
humming/include/humming/utils/all.cuh
humming/include/humming/utils/base.cuh
humming/include/humming/utils/enum.cuh
humming/include/humming/utils/storage.cuh
humming/include/humming/utils/ptx/barrier.cuh
humming/include/humming/utils/ptx/legacy_load.cuh
humming/include/humming/utils/ptx/math.cuh
humming/include/humming/utils/ptx/shared.cuh
humming/include/humming/utils/ptx/tma.cuh
humming/include/humming/utils/ptx/warp.cuh
humming/include/humming/utils/ptx/wgmma.cuh
humming/jit/__init__.py
humming/jit/compiler.py
humming/jit/runtime.py
humming/kernel/__init__.py
humming/kernel/dequant_weight.py
humming/kernel/humming.py
humming/kernel/pack_weight.py
humming/kernel/process_mxfp4.py
humming/kernel/quant_weight.py
humming/kernel/repack_weight.py
humming/kernel/tops_bench.py
humming/kernel/unpack_weight.py
humming/ops/__init__.py
humming/ops/bench.py
humming/ops/input.py
humming/ops/moe.py
humming/ops/utils.py
humming/ops/weight.py
humming/schema/__init__.py
humming/schema/awq.py
humming/schema/base.py
humming/schema/bitnet.py
humming/schema/compressed_tensors.py
humming/schema/fp8.py
humming/schema/gpt_oss_mxfp4.py
humming/schema/gptq.py
humming/schema/humming.py
humming/schema/modelopt.py
humming/schema/mxfp4.py
humming/tune/__init__.py
humming/tune/base.py
humming/tune/sm100.py
humming/tune/sm75.py
humming/tune/sm8x.py
humming/tune/sm90.py
humming/tune/sm90_h20.py
humming/utils/__init__.py
humming/utils/cuda.py
humming/utils/device.py
humming/utils/jit.py
humming/utils/smem.py
humming/utils/test.py
humming/utils/weight.py
humming_kernels.egg-info/PKG-INFO
humming_kernels.egg-info/SOURCES.txt
humming_kernels.egg-info/dependency_links.txt
humming_kernels.egg-info/requires.txt
humming_kernels.egg-info/top_level.txt
tests/test_batch_invariance.py
tests/test_datatype.py
tests/test_epilogue.py
tests/test_f16_accum.py
tests/test_moe.py
tests/test_multi_cast.py
tests/test_pad.py
tests/test_pipeline.py
tests/test_scale.py
tests/test_shape.py
tests/test_zero_point.py