README.md
pyproject.toml
setup.py
src/exllamav3/__init__.py
src/exllamav3/constants.py
src/exllamav3/ext.py
src/exllamav3/model_init.py
src/exllamav3/version.py
src/exllamav3/architecture/__init__.py
src/exllamav3/architecture/apertus.py
src/exllamav3/architecture/arcee.py
src/exllamav3/architecture/architectures.py
src/exllamav3/architecture/cohere.py
src/exllamav3/architecture/cohere2.py
src/exllamav3/architecture/decilm.py
src/exllamav3/architecture/deepseek_v2.py
src/exllamav3/architecture/deepseek_vl2.py
src/exllamav3/architecture/dots1.py
src/exllamav3/architecture/ernie4_5.py
src/exllamav3/architecture/ernie4_5_moe.py
src/exllamav3/architecture/exaone4.py
src/exllamav3/architecture/gemma2.py
src/exllamav3/architecture/gemma3.py
src/exllamav3/architecture/glm4.py
src/exllamav3/architecture/glm4_moe.py
src/exllamav3/architecture/glm4v.py
src/exllamav3/architecture/glm4v_moe.py
src/exllamav3/architecture/hcxvisionv2.py
src/exllamav3/architecture/hyperclovax.py
src/exllamav3/architecture/iquestcoder.py
src/exllamav3/architecture/llama.py
src/exllamav3/architecture/mimo.py
src/exllamav3/architecture/minimax_m2.py
src/exllamav3/architecture/ministral3.py
src/exllamav3/architecture/mistral.py
src/exllamav3/architecture/mistral3.py
src/exllamav3/architecture/mixtral.py
src/exllamav3/architecture/nanochat.py
src/exllamav3/architecture/olmo3.py
src/exllamav3/architecture/phi3.py
src/exllamav3/architecture/qwen2.py
src/exllamav3/architecture/qwen2_5_vl.py
src/exllamav3/architecture/qwen3.py
src/exllamav3/architecture/qwen3_5.py
src/exllamav3/architecture/qwen3_moe.py
src/exllamav3/architecture/qwen3_next.py
src/exllamav3/architecture/qwen3_vl.py
src/exllamav3/architecture/qwen3_vl_moe.py
src/exllamav3/architecture/seedoss.py
src/exllamav3/architecture/smollm3.py
src/exllamav3/architecture/solar_open_moe.py
src/exllamav3/cache/__init__.py
src/exllamav3/cache/cache.py
src/exllamav3/cache/fp16.py
src/exllamav3/cache/quant.py
src/exllamav3/cache/recurrent.py
src/exllamav3/conversion/__init__.py
src/exllamav3/conversion/allocation.py
src/exllamav3/exllamav3_ext/activation.cu
src/exllamav3/exllamav3_ext/activation.cuh
src/exllamav3/exllamav3_ext/activation_kernels.cuh
src/exllamav3/exllamav3_ext/add.cu
src/exllamav3/exllamav3_ext/add.cuh
src/exllamav3/exllamav3_ext/avx2_target.cpp
src/exllamav3/exllamav3_ext/avx2_target.h
src/exllamav3/exllamav3_ext/bindings.cpp
src/exllamav3/exllamav3_ext/causal_conv1d.cu
src/exllamav3/exllamav3_ext/causal_conv1d.cuh
src/exllamav3/exllamav3_ext/compat.cuh
src/exllamav3/exllamav3_ext/gdn.cuh
src/exllamav3/exllamav3_ext/gnd.cu
src/exllamav3/exllamav3_ext/graph.cu
src/exllamav3/exllamav3_ext/graph.cuh
src/exllamav3/exllamav3_ext/hadamard.cpp
src/exllamav3/exllamav3_ext/hadamard.h
src/exllamav3/exllamav3_ext/hgemm.cu
src/exllamav3/exllamav3_ext/hgemm.cuh
src/exllamav3/exllamav3_ext/histogram.cu
src/exllamav3/exllamav3_ext/histogram.cuh
src/exllamav3/exllamav3_ext/norm.cu
src/exllamav3/exllamav3_ext/norm.cuh
src/exllamav3/exllamav3_ext/ptx.cuh
src/exllamav3/exllamav3_ext/reduction.cuh
src/exllamav3/exllamav3_ext/rope.cu
src/exllamav3/exllamav3_ext/rope.cuh
src/exllamav3/exllamav3_ext/routing.cu
src/exllamav3/exllamav3_ext/routing.cuh
src/exllamav3/exllamav3_ext/softcap.cu
src/exllamav3/exllamav3_ext/softcap.cuh
src/exllamav3/exllamav3_ext/stloader.cpp
src/exllamav3/exllamav3_ext/stloader.h
src/exllamav3/exllamav3_ext/stloader_cu.cu
src/exllamav3/exllamav3_ext/stloader_cu.cuh
src/exllamav3/exllamav3_ext/util.cuh
src/exllamav3/exllamav3_ext/util.h
src/exllamav3/exllamav3_ext/cache/q_cache.cu
src/exllamav3/exllamav3_ext/cache/q_cache.cuh
src/exllamav3/exllamav3_ext/cache/q_cache_kernels.cuh
src/exllamav3/exllamav3_ext/generator/cache.cu
src/exllamav3/exllamav3_ext/generator/cache.cuh
src/exllamav3/exllamav3_ext/generator/gumbel.cu
src/exllamav3/exllamav3_ext/generator/gumbel.cuh
src/exllamav3/exllamav3_ext/generator/rep_pen.cu
src/exllamav3/exllamav3_ext/generator/rep_pen.cuh
src/exllamav3/exllamav3_ext/generator/sampling_basic.cu
src/exllamav3/exllamav3_ext/generator/sampling_basic.cuh
src/exllamav3/exllamav3_ext/generator/sampling_extra.cu
src/exllamav3/exllamav3_ext/generator/sampling_extra.cuh
src/exllamav3/exllamav3_ext/generator/strings.cpp
src/exllamav3/exllamav3_ext/generator/strings.h
src/exllamav3/exllamav3_ext/libtorch/blocksparse_mlp.cpp
src/exllamav3/exllamav3_ext/libtorch/blocksparse_mlp.h
src/exllamav3/exllamav3_ext/libtorch/blocksparse_mlp_bc.h
src/exllamav3/exllamav3_ext/libtorch/gated_delta_net.cpp
src/exllamav3/exllamav3_ext/libtorch/gated_delta_net.h
src/exllamav3/exllamav3_ext/libtorch/gated_delta_net_bc.h
src/exllamav3/exllamav3_ext/libtorch/gated_rmsnorm.cpp
src/exllamav3/exllamav3_ext/libtorch/gated_rmsnorm.h
src/exllamav3/exllamav3_ext/libtorch/gated_rmsnorm_bc.h
src/exllamav3/exllamav3_ext/libtorch/linear.cpp
src/exllamav3/exllamav3_ext/libtorch/linear.h
src/exllamav3/exllamav3_ext/libtorch/linear_bc.h
src/exllamav3/exllamav3_ext/libtorch/mlp.cpp
src/exllamav3/exllamav3_ext/libtorch/mlp.h
src/exllamav3/exllamav3_ext/libtorch/mlp_bc.h
src/exllamav3/exllamav3_ext/parallel/all_reduce.cu
src/exllamav3/exllamav3_ext/parallel/all_reduce.cuh
src/exllamav3/exllamav3_ext/parallel/all_reduce_cpu.cu
src/exllamav3/exllamav3_ext/parallel/all_reduce_cpu_avx2.cpp
src/exllamav3/exllamav3_ext/parallel/all_reduce_cpu_avx2.h
src/exllamav3/exllamav3_ext/parallel/barrier.cu
src/exllamav3/exllamav3_ext/parallel/barrier.cuh
src/exllamav3/exllamav3_ext/parallel/barrier_inner.cuh
src/exllamav3/exllamav3_ext/parallel/broadcast.cu
src/exllamav3/exllamav3_ext/parallel/broadcast.cuh
src/exllamav3/exllamav3_ext/parallel/context.cu
src/exllamav3/exllamav3_ext/parallel/context.cuh
src/exllamav3/exllamav3_ext/parallel/gather.cu
src/exllamav3/exllamav3_ext/parallel/gather.cuh
src/exllamav3/exllamav3_ext/parallel/ll.cuh
src/exllamav3/exllamav3_ext/parallel/timeout.cuh
src/exllamav3/exllamav3_ext/quant/codebook.cuh
src/exllamav3/exllamav3_ext/quant/exl3_devctx.cu
src/exllamav3/exllamav3_ext/quant/exl3_devctx.cuh
src/exllamav3/exllamav3_ext/quant/exl3_dq.cuh
src/exllamav3/exllamav3_ext/quant/exl3_gemm.cu
src/exllamav3/exllamav3_ext/quant/exl3_gemm.cuh
src/exllamav3/exllamav3_ext/quant/exl3_gemm_inner.cuh
src/exllamav3/exllamav3_ext/quant/exl3_gemm_kernel.cuh
src/exllamav3/exllamav3_ext/quant/exl3_gemv.cu
src/exllamav3/exllamav3_ext/quant/exl3_gemv.cuh
src/exllamav3/exllamav3_ext/quant/exl3_gemv_kernel.cuh
src/exllamav3/exllamav3_ext/quant/exl3_kernel_map.cu
src/exllamav3/exllamav3_ext/quant/exl3_kernel_map.cuh
src/exllamav3/exllamav3_ext/quant/exl3_kernel_map_samples.cuh
src/exllamav3/exllamav3_ext/quant/hadamard.cu
src/exllamav3/exllamav3_ext/quant/hadamard.cuh
src/exllamav3/exllamav3_ext/quant/hadamard_inner.cuh
src/exllamav3/exllamav3_ext/quant/pack.cu
src/exllamav3/exllamav3_ext/quant/pack.cuh
src/exllamav3/exllamav3_ext/quant/quantize.cu
src/exllamav3/exllamav3_ext/quant/quantize.cuh
src/exllamav3/exllamav3_ext/quant/reconstruct.cu
src/exllamav3/exllamav3_ext/quant/reconstruct.cuh
src/exllamav3/exllamav3_ext/quant/util.cu
src/exllamav3/exllamav3_ext/quant/util.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_1.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_1.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_2.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_2.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_3.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_3.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_4.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_4.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_5.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_5.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_6.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_6.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_7.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_7.cuh
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_8.cu
src/exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_8.cuh
src/exllamav3/generator/__init__.py
src/exllamav3/generator/sampler/__init__.py
src/exllamav3/generator/sampler/custom.py
src/exllamav3/generator/sampler/presets.py
src/exllamav3/generator/sampler/sampler.py
src/exllamav3/loader/__init__.py
src/exllamav3/loader/safetensors.py
src/exllamav3/loader/safetensors_alt.py
src/exllamav3/model/__init__.py
src/exllamav3/model/config.py
src/exllamav3/model/model.py
src/exllamav3/model/model_ls.py
src/exllamav3/model/model_tp.py
src/exllamav3/model/model_tp_alloc.py
src/exllamav3/model/model_tp_backend.py
src/exllamav3/model/model_tp_cuda.py
src/exllamav3/model/model_tp_fn.py
src/exllamav3/model/model_tp_shared.py
src/exllamav3/modules/__init__.py
src/exllamav3/modules/attn.py
src/exllamav3/modules/block_sparse_mlp.py
src/exllamav3/modules/conv.py
src/exllamav3/modules/deepseek_v2_mla_attn.py
src/exllamav3/modules/deepstack.py
src/exllamav3/modules/embedding.py
src/exllamav3/modules/gated_delta_net.py
src/exllamav3/modules/gated_rmsnorm.py
src/exllamav3/modules/gather.py
src/exllamav3/modules/glm4v_pos_embedding.py
src/exllamav3/modules/layernorm.py
src/exllamav3/modules/linear.py
src/exllamav3/modules/mlp.py
src/exllamav3/modules/module.py
src/exllamav3/modules/multilinear.py
src/exllamav3/modules/pos_embedding.py
src/exllamav3/modules/qwen3_vl_pos_embedding.py
src/exllamav3/modules/rmsnorm.py
src/exllamav3/modules/transformer.py
src/exllamav3/modules/quant/__init__.py
src/exllamav3/modules/quant/exl3.py
src/exllamav3/modules/quant/fp16.py
src/exllamav3/modules/quant/exl3_lib/__init__.py
src/exllamav3/modules/quant/exl3_lib/quantize.py
src/exllamav3/tokenizer/__init__.py
src/exllamav3/tokenizer/mm_embedding.py
src/exllamav3/tokenizer/tokenizer.py
src/exllamav3/util/__init__.py
src/exllamav3/util/arch_list.py
src/exllamav3/util/debug.py
src/exllamav3/util/file.py
src/exllamav3/util/hadamard.py
src/exllamav3/util/measures.py
src/exllamav3/util/memory.py
src/exllamav3/util/misc.py
src/exllamav3/util/profile_opt.py
src/exllamav3/util/progress.py
src/exllamav3/util/rope.py
src/exllamav3/util/tensor.py
src/exllamav3/util/vision.py
src/exllamav3/util/hadamard_data/hadamard_1.txt
src/exllamav3/util/hadamard_data/hadamard_100.txt
src/exllamav3/util/hadamard_data/hadamard_116.txt
src/exllamav3/util/hadamard_data/hadamard_156.txt
src/exllamav3/util/hadamard_data/hadamard_172.txt
src/exllamav3/util/hadamard_data/hadamard_188.txt
src/exllamav3/util/hadamard_data/hadamard_236.txt
src/exllamav3/util/hadamard_data/hadamard_244.txt
src/exllamav3/util/hadamard_data/hadamard_428.txt
src/exllamav3/util/hadamard_data/hadamard_52.txt
src/exllamav3/util/hadamard_data/hadamard_92.txt
src/exllamav3/util/hadamard_data/primes.txt
src/exllamav3_inference.egg-info/PKG-INFO
src/exllamav3_inference.egg-info/SOURCES.txt
src/exllamav3_inference.egg-info/dependency_links.txt
src/exllamav3_inference.egg-info/requires.txt
src/exllamav3_inference.egg-info/top_level.txt
src/exllamav3_opt/__init__.py
src/exllamav3_opt/compile.py
src/exllamav3_opt/fp8_cache.py
src/exllamav3_opt/generator.py
src/exllamav3_opt/integration.py
src/exllamav3_opt/prefix_cache.py
src/exllamav3_opt/tensor_pool.py
src/exllamav3_opt/_ext/__init__.py
tests/test_fp8_cache.py
tests/test_fused_kernels.py
tests/test_generator.py
tests/test_integration.py
tests/test_prefix_cache.py