Offline Inference Sparse

Source: the vllm-project/vllm repository.

# Offline (non-server) text generation with a sparse model checkpoint.
from vllm import LLM, SamplingParams

# Load the checkpoint and pass the "sparse_w16a16" sparsity setting to the engine.
# NOTE(review): the "pruned2.4" suffix presumably denotes 2:4 weight pruning,
# and `sparsity=` may only exist in some vLLM builds — confirm against the
# model card and the installed version.
model = LLM("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
            sparsity="sparse_w16a16")

# Cap generation at 100 tokens; temperature=0 is presumably greedy
# (deterministic) decoding — see the SamplingParams documentation.
sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate("Hello my name is", sampling_params=sampling_params)

# A single prompt was submitted, so print the text of the first completion
# of the first (only) result.
print(outputs[0].outputs[0].text)