Offline Profile
Source: vllm-project/vllm.
1import argparse
2import inspect
3import json
4import sys
5from dataclasses import asdict, dataclass
6from typing import Optional
7
8import torch
9
10from vllm import LLM, SamplingParams
11from vllm.profiler import nm_profile
12
13BATCH_SIZE_DEFAULT = 1
14PROMPT_LEN_DEFAULT = 256
15OUTPUT_LEN_DEFAULT = 2
16
17
@dataclass
class ProfileContext:
    """Settings describing one offline profiling run.

    The fields mirror the command-line options of this script. Several of
    them default to ``None`` on the command line, so they are annotated as
    ``Optional``. The field order is part of the positional constructor
    signature and must not be changed.
    """

    # Name or path of the model to load.
    model: str
    # Tokenizer path; when None, run_profile falls back to `model`.
    tokenizer: Optional[str]
    # Model revision forwarded to the engine; None when not specified.
    model_revision: Optional[str]
    # Weight quantization method name; None when not specified.
    quantization: Optional[str]
    # Requested maximum sequence length (prompt + output); None when not
    # specified.
    max_model_len: Optional[int]
    # Requested per-iteration token budget; None when not specified.
    max_num_batched_tokens: Optional[int]
    # Number of random prompt tokens generated for every request.
    prompt_len: int
    # Number of tokens to generate per request; the first engine step is
    # profiled as prefill and the remaining output_len - 1 steps as decode.
    output_len: int
    # Number of requests submitted together as one batch.
    batch_size: int
    # Model dtype string forwarded to the engine (e.g. "auto").
    dtype: str
    # Tensor-parallel size forwarded to the engine.
    tensor_parallel_size: int
    # When False, the engine is created with enforce_eager=True.
    allow_cuda_graphs: bool
32
33
def _strip_suffix(filename: str, suffix: str) -> str:
    """Return ``filename`` without a trailing ``suffix``, if it has one."""
    # str.rstrip() removes a *set of characters*, not a suffix, so it would
    # turn "stats.csv" into "stat"; slice the exact suffix off instead.
    if suffix and filename.endswith(suffix):
        return filename[:-len(suffix)]
    return filename


def _print_table_header(title: str, prompt_len: int, batch_size: int) -> None:
    """Print the banner shown above each profiler table."""
    print("=" * 80)
    print(f"= {title} "
          f"(prompt_len={prompt_len}, batch_size={batch_size})")
    print("=" * 80)
    print()


def run_profile(context: ProfileContext, csv_output: Optional[str],
                json_output: Optional[str]):
    """Profile one prefill step and the following decode steps of a model.

    Builds an ``LLM`` from ``context``, queues ``context.batch_size`` requests
    of ``context.prompt_len`` random token ids each, and runs the engine one
    step at a time under ``nm_profile``. The first step is profiled as the
    prefill; the remaining ``context.output_len - 1`` steps are profiled
    individually as decode steps. Model and summary tables are printed for
    the prefill and for the first decode step (if any).

    Args:
        context: Model, engine, and workload settings for the run.
        csv_output: Root filename for CSV export (a trailing ``.csv`` is
            removed); ``<root>_prefill_model_table.csv``,
            ``<root>_prefill_summary_table.csv`` and, when a decode step was
            run, ``<root>_decode_model_table.csv`` and
            ``<root>_decode_summary_table.csv`` are written. Skipped when
            falsy.
        json_output: Filename for the JSON export; ``.json`` is appended if
            missing. Skipped when falsy.

    Raises:
        SystemExit: With status -1 when the requested batch cannot be run in
            a single step (too many tokens or sequences) or when
            ``prompt_len + output_len`` exceeds the model's maximum length.
    """
    print("Run profile with:")
    for key, value in asdict(context).items():
        print(f" {key} = {value}")

    # Create sampling params; ignore_eos keeps every request generating for
    # the full output_len tokens.
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     max_tokens=context.output_len,
                                     ignore_eos=True)

    # Sparsity is in the future
    # Create LLM
    tokenizer = (context.tokenizer
                 if context.tokenizer is not None else context.model)
    llm = LLM(model=context.model,
              tokenizer=tokenizer,
              revision=context.model_revision,
              enforce_eager=not context.allow_cuda_graphs,
              tensor_parallel_size=context.tensor_parallel_size,
              gpu_memory_utilization=0.9,
              max_model_len=context.max_model_len,
              quantization=context.quantization,
              dtype=context.dtype,
              max_num_batched_tokens=context.max_num_batched_tokens)

    batch_size = context.batch_size
    prompt_len = context.prompt_len
    output_len = context.output_len

    # Validate against the limits the engine actually ended up with, not the
    # (possibly None) values that were requested.
    engine = llm.llm_engine
    scheduler_config = engine.scheduler_config
    max_model_len = engine.model_config.max_model_len
    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
    max_num_seqs = scheduler_config.max_num_seqs

    if batch_size * prompt_len > max_num_batched_tokens:
        print(f"ERROR: chosen batch_size * prompt_len "
              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
              f"and therefore cannot be run in a single profile step, please "
              f"choose a smaller batch size or prompt length, or increase "
              f"--max-num-batched-tokens")
        sys.exit(-1)
    # Strictly "larger than": a batch exactly equal to max_num_seqs is
    # accepted, matching the wording of the error message.
    if batch_size > max_num_seqs:
        print(
            f"ERROR: chosen batch_size ({batch_size}) is larger than "
            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
            f"single profile step, please choose a smaller batch size")
        sys.exit(-1)
    print("llm.llm_engine.model_config.max_model_len: ", max_model_len)
    if prompt_len + output_len > max_model_len:
        print(
            f"ERROR: chosen prompt_len + output_len ({prompt_len} + "
            f"{output_len} = {prompt_len + output_len}) is larger than the "
            f"model's max_model_len ({max_model_len}), please choose a smaller "
            f"prompt_len or output_len, or increase --max-model-len")
        sys.exit(-1)

    # Queue batch_size requests, each with prompt_len random token ids drawn
    # from [0, vocab_size).
    vocab_size = engine.model_config.get_vocab_size()
    for i in range(batch_size):
        prompt_token_ids = torch.randint(vocab_size,
                                         size=(prompt_len, )).tolist()

        engine.add_request(request_id=f"seq{i}",
                           inputs={'prompt_token_ids': prompt_token_ids},
                           params=sampling_params)

    with nm_profile() as prefill_prof:
        engine.step()  # First step is prefill

    # Every remaining step is profiled separately as a decode step.
    decode_results_list = []
    for _ in range(output_len - 1):
        with nm_profile() as decode_prof:
            engine.step()
        decode_results_list.append(decode_prof.results)

    prefill_results = prefill_prof.results
    has_decode = len(decode_results_list) > 0

    _print_table_header("Prefill Model Table", prompt_len, batch_size)
    prefill_results.print_model_table()

    if has_decode:
        print()
        _print_table_header("First Decode Step Model Table", prompt_len,
                            batch_size)
        decode_results_list[0].print_model_table()

    print()
    _print_table_header("Prefill Summary Table", prompt_len, batch_size)
    prefill_results.print_summary_table()

    if has_decode:
        print()
        _print_table_header("First Decode Step Summary Table", prompt_len,
                            batch_size)
        decode_results_list[0].print_summary_table()

    if csv_output:
        csv_filename_base = _strip_suffix(csv_output, ".csv")
        prefill_results.export_model_stats_table_csv(
            csv_filename_base + "_prefill_model_table.csv")
        prefill_results.export_summary_stats_table_csv(
            csv_filename_base + "_prefill_summary_table.csv")

        # Only the first decode step is exported to CSV.
        if has_decode:
            decode_results_list[0].export_model_stats_table_csv(
                csv_filename_base + "_decode_model_table.csv")
            decode_results_list[0].export_summary_stats_table_csv(
                csv_filename_base + "_decode_summary_table.csv")

    if json_output:
        cuda_devices = [
            torch.cuda.get_device_properties(dev_idx)
            for dev_idx in range(torch.cuda.device_count())
        ]

        json_dict = {
            "context": {
                "python_version": f"{sys.version}",
                "torch_version": f"{torch.__version__}",
                "torch_cuda_version": f"{torch.version.cuda}",
                "cuda_devices": f"{cuda_devices}",
                **asdict(context)
            },
            "prefill": prefill_results.convert_stats_to_dict(),
        }

        # Unlike the CSV export, every decode step is written, keyed
        # decode_1, decode_2, ...
        for idx, dr in enumerate(decode_results_list):
            json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()

        # Normalise the name so the output always ends in exactly one ".json".
        json_filename = _strip_suffix(json_output, ".json") + ".json"
        with open(json_filename, "w") as f:
            json.dump(json_dict, f, indent=2)
184
185
if __name__ == "__main__":
    # Command-line entry point: parse the options, build a ProfileContext
    # from them, and run the profile.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help='The name or path of a HuggingFace Transformers model.')
    parser.add_argument("--tokenizer",
                        type=str,
                        default=None,
                        help="path to the tokenizer")

    parser.add_argument("--model-revision", type=str, default=None)
    parser.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Export the results as multiple csv files. This should be the "
        "root filename, will create <filename>_prefill_model_table.csv, "
        "<filename>_prefill_summary_table.csv, "
        "<filename>_decode_model_table.csv, and "
        "<filename>_decode_summary_table.csv")
    parser.add_argument(
        "--json",
        type=str,
        default=None,
        help="Export the results as a json file. This should be the filename")
    parser.add_argument(
        "--quantization",
        "-q",
        type=str,
        choices=['awq', 'gptq', 'squeezellm', 'marlin', 'smoothquant', None],
        default=None,
        help="The method used to quantize the model weights, options are "
        "\"marlin\", \"awq\", \"gptq\", \"squeezellm\", \"smoothquant\"")
    parser.add_argument("--dtype",
                        type=str,
                        default='auto',
                        help="model dtype")
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=None,
        help="Maximum length of a sequence (including prompt and output)")
    parser.add_argument(
        "--max-num-batched-tokens",
        type=int,
        default=None,
        help="Maximum number of tokens to be processed in a single iteration. "
        "Should be at least batch-size * prompt-len so the prefill can "
        "run in a single iteration.")
    parser.add_argument(
        "--prompt-len",
        type=int,
        default=PROMPT_LEN_DEFAULT,
        help=f"Length of the random prompt to use when profiling, all batched "
        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
    parser.add_argument(
        "--output-len",
        type=int,
        default=OUTPUT_LEN_DEFAULT,
        help=
        f"Number of output decode steps to run, default={OUTPUT_LEN_DEFAULT}")
    parser.add_argument("--batch-size",
                        type=int,
                        default=BATCH_SIZE_DEFAULT,
                        help=f"Number of requests to run as a single batch, "
                        f"default={BATCH_SIZE_DEFAULT}")
    parser.add_argument("--tensor-parallel-size",
                        "-tp",
                        type=int,
                        default=1,
                        help="Number of GPUs to use i.e. tensor parallelism, "
                        "default=1")
    parser.add_argument(
        "--allow-cuda-graphs",
        action='store_true',
        help="Enables cuda graphs to be used, will remove a lot of the module "
        "level info in the profiler results since almost everything runs in "
        "the graph where we do not have access to an informative stack trace")

    args = parser.parse_args()

    # Forward only the options that are ProfileContext constructor
    # parameters; --csv and --json are passed to run_profile separately.
    context_params = inspect.signature(ProfileContext).parameters
    context = ProfileContext(
        **{
            k: v
            for k, v in vars(args).items() if k in context_params
        })
    run_profile(context, csv_output=args.csv, json_output=args.json)