Offline Profile
Source: vllm-project/vllm.
1import argparse
2import inspect
3import json
4import sys
5from dataclasses import asdict, dataclass
6from typing import Optional
7
8import torch
9
10from vllm import LLM, SamplingParams
11from vllm.profiler import nm_profile
12
# Default for the --batch-size CLI flag (number of requests profiled together).
BATCH_SIZE_DEFAULT = 1
# Default for the --prompt-len CLI flag (token count of each random prompt).
PROMPT_LEN_DEFAULT = 256
# Default for the --max-seq-len CLI flag (prompt plus generated tokens).
MAX_SEQ_LEN_DEFAULT = 1024
16
17
@dataclass
class ProfileContext:
    """Settings describing one offline profiling run.

    An instance is built from the parsed command-line arguments and passed
    to ``run_profile``, which echoes every field and embeds them in the JSON
    export. Fields that the command line leaves unset arrive as ``None``,
    hence the ``Optional`` annotations.
    """
    # Name or path of the model; forwarded as ``LLM(model=...)``.
    model: str
    # Model revision; forwarded as ``LLM(revision=...)``. None when not given.
    model_revision: Optional[str]
    # Sparse-weight compression method; forwarded as ``LLM(sparsity=...)``.
    sparsity: Optional[str]
    # Weight quantization method; forwarded as ``LLM(quantization=...)``.
    quantization: Optional[str]
    # Forwarded as ``LLM(max_model_len=...)``.
    max_seq_len: int
    # Forwarded as ``LLM(max_num_batched_tokens=...)``. None when not given.
    max_num_batched_tokens: Optional[int]
    # Number of random token ids generated for each profiled request.
    prompt_len: int
    # Number of requests queued before the profiled steps.
    batch_size: int
    # Forwarded as ``LLM(tensor_parallel_size=...)``.
    tensor_parallel_size: int
    # When False the engine is created with ``enforce_eager=True``.
    allow_cuda_graphs: bool
30
31
def _strip_suffix(filename: str, suffix: str) -> str:
    """Return ``filename`` without a trailing ``suffix``, if it has one."""
    # str.rstrip() takes a *set of characters*, not a suffix, so
    # "metrics.csv".rstrip(".csv") yields "metri". Slice the exact suffix off
    # instead (str.removesuffix is avoided to stay compatible with older
    # Python versions).
    if suffix and filename.endswith(suffix):
        return filename[:-len(suffix)]
    return filename


def _print_banner(title: str, prompt_len: int, batch_size: int) -> None:
    """Print the boxed heading shown above each results table."""
    print("=" * 80)
    print(f"= {title} "
          f"(prompt_len={prompt_len}, batch_size={batch_size})")
    print("=" * 80)
    print()


def run_profile(context: ProfileContext, csv_output: Optional[str],
                json_output: Optional[str]):
    """Profile one prefill step and the following engine step of a model.

    Builds an ``LLM`` from ``context``, queues ``context.batch_size`` requests
    of ``context.prompt_len`` random token ids each, and runs two engine
    steps, each inside its own ``nm_profile()`` context. The model and
    summary tables of both steps are printed, and optionally exported.

    Args:
        context: Engine and workload settings for the run.
        csv_output: Root filename for CSV export (a trailing ``.csv`` is
            removed). Four files are written: ``<root>_prefill_model_table.csv``,
            ``<root>_prefill_summary_table.csv``,
            ``<root>_decode_model_table.csv`` and
            ``<root>_decode_summary_table.csv``. Nothing is written when
            falsy.
        json_output: Filename for JSON export; ``.json`` is appended when
            missing. Nothing is written when falsy.

    Exits the process with status -1 (via ``sys.exit``) when the requested
    batch cannot be processed in a single engine step.
    """
    print("Run profile with:")
    for key, value in asdict(context).items():
        print(f" {key} = {value}")

    # Create sampling params
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=8)

    # Create LLM
    llm = LLM(
        model=context.model,
        revision=context.model_revision,
        sparsity=context.sparsity,
        enforce_eager=not context.allow_cuda_graphs,
        tensor_parallel_size=context.tensor_parallel_size,
        gpu_memory_utilization=0.9,
        max_model_len=context.max_seq_len,
        quantization=context.quantization,
        max_num_batched_tokens=context.max_num_batched_tokens,
    )

    batch_size = context.batch_size
    prompt_len = context.prompt_len

    # Read the limits back from the engine rather than from `context`: the
    # context value for max_num_batched_tokens may be None.
    scheduler_config = llm.llm_engine.scheduler_config
    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
    max_num_seqs = scheduler_config.max_num_seqs

    if batch_size * prompt_len > max_num_batched_tokens:
        print(f"ERROR: chosen batch_size * prompt_len "
              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
              f"and therefore cannot be run in a single profile step, please "
              f"choose a smaller batch size or prompt length, or increase "
              f"--max_num_batched_tokens")
        sys.exit(-1)
    # Strictly greater: a batch of exactly max_num_seqs sequences is within
    # the limit, which is also what the error message states.
    if batch_size > max_num_seqs:
        print(
            f"ERROR: chosen batch_size ({batch_size}) is larger than "
            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
            f"single profile step, please choose a smaller batch size")
        sys.exit(-1)

    # Upper bound for the random token ids; loop-invariant, so computed once.
    token_id_upper_bound = llm.llm_engine.model_config.get_vocab_size() // 2
    for i in range(batch_size):
        llm.llm_engine.add_request(
            request_id=f"seq{i}",
            prompt=None,
            prompt_token_ids=torch.randint(
                128,  # 128 to skip over special tokens
                token_id_upper_bound,
                size=(prompt_len, )).tolist(),
            sampling_params=sampling_params)

    with nm_profile() as prefill_prof:
        llm.llm_engine.step()  # First step is prefill

    with nm_profile() as decode_prof:
        llm.llm_engine.step()  # Following step is reported as decode

    prefill_results = prefill_prof.results
    decode_results = decode_prof.results

    _print_banner("Prefill Model Table", prompt_len, batch_size)
    prefill_results.print_model_table()
    print()
    _print_banner("Decode Model Table", prompt_len, batch_size)
    decode_results.print_model_table()
    print()
    _print_banner("Prefill Summary Table", prompt_len, batch_size)
    prefill_results.print_summary_table()
    print()
    _print_banner("Decode Summary Table", prompt_len, batch_size)
    decode_results.print_summary_table()

    if csv_output:
        csv_filename_base = _strip_suffix(csv_output, ".csv")
        prefill_results.export_model_stats_table_csv(
            csv_filename_base + "_prefill_model_table.csv")
        prefill_results.export_summary_stats_table_csv(
            csv_filename_base + "_prefill_summary_table.csv")
        decode_results.export_model_stats_table_csv(
            csv_filename_base + "_decode_model_table.csv")
        decode_results.export_summary_stats_table_csv(
            csv_filename_base + "_decode_summary_table.csv")

    if json_output:
        cuda_devices = [
            torch.cuda.get_device_properties(dev_idx)
            for dev_idx in range(torch.cuda.device_count())
        ]

        json_dict = {
            "context": {
                "python_version": f"{sys.version}",
                "torch_version": f"{torch.__version__}",
                "torch_cuda_version": f"{torch.version.cuda}",
                # Device properties are stored as their string form.
                "cuda_devices": f"{cuda_devices}",
                **asdict(context)
            },
            "prefill": prefill_results.convert_stats_to_dict(),
            "decode": decode_results.convert_stats_to_dict()
        }

        json_filename = _strip_suffix(json_output, ".json") + ".json"
        with open(json_filename, "w") as f:
            json.dump(json_dict, f, indent=2)
155
156
if __name__ == "__main__":
    # Command-line entry point: parse the flags, build a ProfileContext from
    # them and run a single profile.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help='The name or path of a HuggingFace Transformers model.')
    parser.add_argument("--model-revision", type=str, default=None)
    parser.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Export the results as multiple csv files. This should be the "
        "root filename, will create <filename>_prefill_model_table.csv, "
        "<filename>_prefill_summary_table.csv, "
        "<filename>_decode_model_table.csv, and "
        "<filename>_decode_summary_table.csv")
    parser.add_argument(
        "--json",
        type=str,
        default=None,
        help="Export the results as a json file. This should be the filename")
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment of a multi-line help text must carry its own separating space.
    parser.add_argument(
        "--sparsity",
        "-s",
        type=str,
        choices=[None, 'sparse_w16a16', 'semi_structured_sparse_w16a16'],
        default=None,
        help="Method used to compress sparse weights. If "
        "None, we first check the `sparsity_config` attribute "
        "in the model config file. If that is None we assume "
        "the model weights are dense")
    parser.add_argument(
        "--quantization",
        "-q",
        type=str,
        choices=['awq', 'gptq', 'squeezellm', 'marlin', None],
        default=None,
        help="The method used to quantize the model weights, "
        "options are \"marlin\", \"awq\", \"gptq\" and \"squeezellm\"")
    parser.add_argument(
        "--max-seq-len",
        type=int,
        default=MAX_SEQ_LEN_DEFAULT,
        help=f"Maximum length of a sequence (including prompt and output), "
        f"default={MAX_SEQ_LEN_DEFAULT}")
    parser.add_argument(
        "--max-num-batched-tokens",
        type=int,
        default=None,
        help="Maximum number of tokens to be processed in a single iteration. "
        "Should be at least batch-size * prompt-len so the prefill can "
        "run in a single iteration.")
    parser.add_argument(
        "--prompt-len",
        type=int,
        default=PROMPT_LEN_DEFAULT,
        help=f"Length of the random prompt to use when profiling, all batched "
        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
    parser.add_argument("--batch-size",
                        type=int,
                        default=BATCH_SIZE_DEFAULT,
                        help=f"Number of requests to run as a single batch, "
                        f"default={BATCH_SIZE_DEFAULT}")
    parser.add_argument("--tensor-parallel-size",
                        "-tp",
                        type=int,
                        default=1,
                        help="Number of GPUs to use i.e. tensor parallelism, "
                        "default=1")
    parser.add_argument(
        "--allow-cuda-graphs",
        action='store_true',
        help="Enables cuda graphs to be used, will remove a lot of the module "
        "level info in the profiler results since almost everything runs in "
        "the graph where we do not have access to an informative stack trace")

    args = parser.parse_args()

    # Forward only the parsed attributes that are ProfileContext constructor
    # parameters; the output options (csv, json) are passed separately below.
    context = ProfileContext(
        **{
            k: v
            for k, v in vars(args).items()
            if k in inspect.signature(ProfileContext).parameters
        })
    run_profile(context, csv_output=args.csv, json_output=args.json)