Offline Inference 24

Source: vllm-project/vllm.

 1from vllm import LLM, SamplingParams
 2
 3model = LLM("nm-testing/zephyr-50sparse-24",
 4            sparsity="semi_structured_sparse_w16a16",
 5            enforce_eager=True,
 6            dtype="float16",
 7            tensor_parallel_size=1,
 8            max_model_len=1024)
 9
10sampling_params = SamplingParams(max_tokens=100, temperature=0)
11outputs = model.generate("Hello my name is", sampling_params=sampling_params)
12print(outputs[0].outputs[0].text)