Offline Inference 24
Source: vllm-project/vllm.
# Offline (non-server) text generation with a sparse checkpoint: build an
# engine, generate one completion for a single prompt, and print it.
from vllm import LLM, SamplingParams

# Engine configuration. The checkpoint name and the `sparsity` format string
# are passed through to vLLM as-is.
# NOTE(review): `sparsity` is not part of every vLLM release's `LLM`
# signature -- confirm the installed build accepts it.
model = LLM(
    "nm-testing/zephyr-50sparse-24",
    sparsity="semi_structured_sparse_w16a16",
    enforce_eager=True,       # eager execution (per vLLM docs: no CUDA graphs)
    dtype="float16",
    tensor_parallel_size=1,   # no tensor-parallel sharding
    max_model_len=1024,       # cap on context length
)

# At most 100 new tokens; temperature 0 selects greedy sampling according to
# the vLLM SamplingParams documentation.
sampling_params = SamplingParams(max_tokens=100, temperature=0)

# One prompt in, so the result list has a single request entry.
outputs = model.generate("Hello my name is", sampling_params=sampling_params)

# Text of the first completion of the first (only) request.
print(outputs[0].outputs[0].text)