Metadata-Version: 2.4
Name: vllm-hust
Version: 0.17.2rc1.dev98
Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
Author: vLLM Team
License-Expression: Apache-2.0
Project-URL: Homepage, https://github.com/vllm-project/vllm
Project-URL: Documentation, https://docs.vllm.ai/en/latest/
Project-URL: Slack, https://slack.vllm.ai/
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Information Technology
Classifier: Intended Audience :: Science/Research
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Scientific/Engineering :: Information Analysis
Requires-Python: <3.14,>=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: regex
Requires-Dist: cachetools
Requires-Dist: psutil
Requires-Dist: sentencepiece
Requires-Dist: numpy==1.26.4; platform_machine == "aarch64"
Requires-Dist: numpy; platform_machine != "aarch64"
Requires-Dist: requests>=2.26.0
Requires-Dist: tqdm
Requires-Dist: blake3
Requires-Dist: py-cpuinfo
Requires-Dist: transformers<5,>=4.56.0
Requires-Dist: tokenizers>=0.21.1
Requires-Dist: protobuf!=6.30.*,!=6.31.*,!=6.32.*,!=6.33.0.*,!=6.33.1.*,!=6.33.2.*,!=6.33.3.*,!=6.33.4.*,>=5.29.6
Requires-Dist: fastapi[standard]>=0.115.0
Requires-Dist: aiohttp>=3.13.3
Requires-Dist: openai>=2.0.0
Requires-Dist: pydantic>=2.12.0
Requires-Dist: prometheus_client>=0.18.0
Requires-Dist: pillow
Requires-Dist: prometheus-fastapi-instrumentator>=7.0.0
Requires-Dist: tiktoken>=0.6.0
Requires-Dist: lm-format-enforcer==0.11.3
Requires-Dist: llguidance<1.4.0,>=1.3.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x" or platform_machine == "ppc64le"
Requires-Dist: outlines_core==0.2.11
Requires-Dist: diskcache==5.6.3
Requires-Dist: lark==1.2.2
Requires-Dist: xgrammar<1.0.0,>=0.1.32; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
Requires-Dist: typing_extensions>=4.10
Requires-Dist: filelock>=3.16.1
Requires-Dist: partial-json-parser
Requires-Dist: pyzmq>=25.0.0
Requires-Dist: msgspec
Requires-Dist: gguf>=0.17.0
Requires-Dist: mistral_common[image]>=1.10.0
Requires-Dist: opencv-python-headless==4.11.0.86; platform_machine == "aarch64"
Requires-Dist: opencv-python-headless>=4.13.0; platform_machine != "aarch64"
Requires-Dist: pyyaml
Requires-Dist: six>=1.16.0; python_version > "3.11"
Requires-Dist: setuptools<81.0.0,>=77.0.3; python_version > "3.11"
Requires-Dist: einops
Requires-Dist: compressed-tensors==0.14.0.1
Requires-Dist: depyf==0.20.0
Requires-Dist: cloudpickle
Requires-Dist: watchfiles
Requires-Dist: python-json-logger
Requires-Dist: ninja
Requires-Dist: pybase64
Requires-Dist: cbor2
Requires-Dist: ijson
Requires-Dist: setproctitle
Requires-Dist: openai-harmony>=0.0.3
Requires-Dist: anthropic>=0.71.0
Requires-Dist: model-hosting-container-standards<1.0.0,>=0.1.13
Requires-Dist: mcp
Requires-Dist: opentelemetry-sdk>=1.27.0
Requires-Dist: opentelemetry-api>=1.27.0
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
Requires-Dist: opentelemetry-semantic-conventions-ai>=0.4.1
Requires-Dist: numba==0.61.2
Requires-Dist: torch==2.10.0
Requires-Dist: torchaudio==2.10.0
Requires-Dist: torchvision==0.25.0
Requires-Dist: flashinfer-python==0.6.6
Requires-Dist: flashinfer-cubin==0.6.6
Requires-Dist: nvidia-cudnn-frontend<1.19.0,>=1.13.0
Requires-Dist: nvidia-cutlass-dsl>=4.4.0.dev1
Requires-Dist: quack-kernels>=0.2.7
Provides-Extra: zen
Requires-Dist: zentorch; extra == "zen"
Provides-Extra: bench
Requires-Dist: pandas; extra == "bench"
Requires-Dist: matplotlib; extra == "bench"
Requires-Dist: seaborn; extra == "bench"
Requires-Dist: datasets; extra == "bench"
Requires-Dist: scipy; extra == "bench"
Requires-Dist: plotly; extra == "bench"
Provides-Extra: tensorizer
Requires-Dist: tensorizer==2.10.1; extra == "tensorizer"
Provides-Extra: fastsafetensors
Requires-Dist: fastsafetensors>=0.2.2; extra == "fastsafetensors"
Provides-Extra: instanttensor
Requires-Dist: instanttensor>=0.1.5; extra == "instanttensor"
Provides-Extra: runai
Requires-Dist: runai-model-streamer[azure,gcs,s3]>=0.15.7; extra == "runai"
Provides-Extra: audio
Requires-Dist: av; extra == "audio"
Requires-Dist: resampy; extra == "audio"
Requires-Dist: scipy; extra == "audio"
Requires-Dist: soundfile; extra == "audio"
Requires-Dist: mistral_common[audio]; extra == "audio"
Provides-Extra: video
Provides-Extra: flashinfer
Provides-Extra: petit-kernel
Requires-Dist: petit-kernel; extra == "petit-kernel"
Provides-Extra: helion
Requires-Dist: helion==0.3.2; extra == "helion"
Provides-Extra: grpc
Requires-Dist: smg-grpc-servicer[vllm]>=0.5.0; extra == "grpc"
Provides-Extra: otel
Requires-Dist: opentelemetry-sdk>=1.26.0; extra == "otel"
Requires-Dist: opentelemetry-api>=1.26.0; extra == "otel"
Requires-Dist: opentelemetry-exporter-otlp>=1.26.0; extra == "otel"
Requires-Dist: opentelemetry-semantic-conventions-ai>=0.4.1; extra == "otel"
Dynamic: license-file
Dynamic: provides-extra
Dynamic: requires-dist

<!-- markdownlint-disable MD001 MD041 -->
<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
  </picture>
</p>

<h3 align="center">
Easy, fast, and cheap LLM serving for everyone
</h3>

<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>

🔥 We have built a vLLM website to help you get started with vLLM. Please visit [vllm.ai](https://vllm.ai) to learn more.
For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.

---

## About

vLLM is a fast and easy-to-use library for LLM inference and serving.

Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.

vLLM is fast with:

- State-of-the-art serving throughput
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
- Speculative decoding
- Chunked prefill

vLLM is flexible and easy to use with:

- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support
- Multi-LoRA support

vLLM seamlessly supports most popular open-source models on HuggingFace, including:

- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Experts LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)

Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).

## Getting Started

Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):

```bash
pip install vllm
```

### Workspace Ascend Plugin (vllm-ascend)

For this multi-root workspace, you can install the local `vllm-ascend`
checkout as a platform plugin for `vllm-hust` with:

```bash
cd /home/shuhao/vllm-ascend-hust
bash scripts/install_local_ascend_plugin.sh
```

If your `vllm-ascend` repo is in a different location:

```bash
bash /home/shuhao/vllm-ascend-hust/scripts/install_local_ascend_plugin.sh /path/to/vllm-ascend
```

This script installs `vllm-ascend` in editable mode and verifies that entry
points under `vllm.platform_plugins` are discoverable.
It defaults to lightweight mode (`COMPILE_CUSTOM_KERNELS=0`, `--no-deps`) so
you can wire the plugin into the workspace even when the Ascend custom-op
toolchain is not fully configured.
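
If you want to re-check discoverability by hand afterwards, a minimal probe of
the `vllm.platform_plugins` entry-point group looks like this (the names
printed depend on what `vllm-ascend` actually registers):

```bash
# List entry points registered under the vllm.platform_plugins group;
# an installed vllm-ascend plugin should show up here.
python -c "
from importlib.metadata import entry_points
for ep in entry_points(group='vllm.platform_plugins'):
    print(ep.name, '->', ep.value)
"
```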

### Avoid Mixed Ascend Runtime (Recommended)

To avoid mixing multiple CANN/Ascend toolkit trees in one shell session,
always source a single runtime first:

```bash
cd /home/shuhao/vllm-ascend-hust
source scripts/use_single_ascend_env.sh /usr/local/Ascend/ascend-toolkit.bak.8.1/latest
```

The script now also loads `/usr/local/Ascend/nnal/atb/set_env.sh` to ensure
that the ATB operator runtime variables are configured. If this file is
missing, install the NNAL/ATB package first.
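
A quick existence check before sourcing the environment script (adjust the
path if your NNAL install lives elsewhere):

```bash
# Verify the ATB environment script is present before relying on it
if [ -f /usr/local/Ascend/nnal/atb/set_env.sh ]; then
  echo "NNAL/ATB environment script found"
else
  echo "NNAL/ATB not installed; install the NNAL/ATB package first" >&2
fi
```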

Then run the benchmark through the wrapper (it sources the same environment
script internally):

```bash
bash /home/shuhao/vllm-ascend-hust/scripts/run_ascend_latency_bench.sh /usr/local/Ascend/ascend-toolkit.bak.8.1/latest
```

If you omit the path, the scripts use a default toolkit root suitable for
this workspace.
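
For example, relying on that default is simply the wrapper with no argument:

```bash
# Same wrapper, falling back to the workspace default toolkit root
bash /home/shuhao/vllm-ascend-hust/scripts/run_ascend_latency_bench.sh
```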

### One-Click Ascend Bootstrap

To make local Ascend deployment closer to a one-command flow, use:

```bash
cd /home/shuhao/vllm-ascend-hust
bash scripts/bootstrap_ascend.sh Qwen/Qwen2.5-1.5B-Instruct
```

### Separate Local OpenAI Server Command

If you only want to start the local vllm-hust OpenAI-compatible server on
Ascend, use the native `vllm-hust serve` command directly instead of going
through the workstation.

By default, vllm-hust now auto-injects minimal Ascend runtime paths at import
time (`ASCEND_HOME_PATH`, `LD_LIBRARY_PATH`, `PATH`) so pip-installed users can
start directly without manually sourcing `set_env.sh` in common single-toolkit
setups.

If you prefer strict manual environment control, disable this behavior with:

```bash
export VLLM_ASCEND_AUTO_ENV=0
```
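
To confirm which of those variables ended up in the process environment after
import (or that injection stayed off), a quick probe like the one below can
help; it assumes the package still imports as `vllm` and only reads the
variables named above:

```bash
# Print the Ascend-related variables that auto-injection may touch
python -c "
import os
import vllm  # import triggers auto-injection unless VLLM_ASCEND_AUTO_ENV=0
for key in ('ASCEND_HOME_PATH', 'LD_LIBRARY_PATH', 'PATH'):
    print(key, '=', os.environ.get(key, '<unset>'))
"
```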

For multi-toolkit or customized runtime setups, manual sourcing is still
recommended:

```bash
cd /home/shuhao/vllm-ascend-hust
source scripts/use_single_ascend_env.sh /usr/local/Ascend/ascend-toolkit.bak.8.1/latest
export PYTHONPATH="/usr/local/Ascend/ascend-toolkit.bak.8.1/latest/python/site-packages:${PYTHONPATH:-}"
vllm-hust serve Qwen/Qwen2.5-1.5B-Instruct \
  --host 0.0.0.0 \
  --port 8080 \
  --enforce-eager \
  -cc.cudagraph_mode=0 \
  --enable-auto-tool-choice \
  --tool-call-parser pythonic \
  --no-enable-prefix-caching \
  --no-enable-chunked-prefill
```

If you are serving a local snapshot, replace the model argument directly:

```bash
vllm-hust serve /path/to/local/model \
  --host 0.0.0.0 \
  --port 8080 \
  --enforce-eager \
  -cc.cudagraph_mode=0 \
  --enable-auto-tool-choice \
  --tool-call-parser pythonic \
  --no-enable-prefix-caching \
  --no-enable-chunked-prefill
```
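
Once the server is up, you can sanity-check it with any OpenAI-compatible
client. A minimal `curl` against the chat completions endpoint looks like this
(the `model` field must match the model name or path you served):

```bash
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-1.5B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32
  }'
```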

Manager integration defaults:

- manager repo path: `/home/shuhao/vllm-hust-dev-hub/ascend-runtime-manager`
- manager PyPI package: `hust-ascend-manager`
- disable manager: `HUST_DISABLE_ASCEND_MANAGER=1`
- manager strict mode: `HUST_MANAGER_STRICT=1`
- manager system install steps: `HUST_MANAGER_APPLY_SYSTEM=1`
- manager PyPI override: `HUST_ASCEND_MANAGER_PYPI_SPEC='hust-ascend-manager==0.1.0'`

If you need strict `npugraph_ex` validation, set `HUST_REQUIRE_NPUGRAPH=1`
before running the script.
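
For example, a run that skips the manager but requires `npugraph_ex`
validation might export both variables before invoking the bootstrap script
(a hypothetical combination; values follow the defaults listed above):

```bash
# Hypothetical combination: disable manager integration, require npugraph_ex
export HUST_DISABLE_ASCEND_MANAGER=1
export HUST_REQUIRE_NPUGRAPH=1
bash /home/shuhao/vllm-ascend-hust/scripts/bootstrap_ascend.sh Qwen/Qwen2.5-1.5B-Instruct
```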

Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.

- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

## Contributing

We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.

## Citation

If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):

```bibtex
@inproceedings{kwon2023efficient,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
  year={2023}
}
```

## Contact Us

<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [collaboration@vllm.ai](mailto:collaboration@vllm.ai)
<!-- --8<-- [end:contact-us] -->

## Media Kit

- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
