Offline Profile

Source: vllm-project/vllm.

  1import argparse
  2import inspect
  3import json
  4import sys
  5from dataclasses import asdict, dataclass
  6from typing import Optional
  7
  8import torch
  9
 10from vllm import LLM, SamplingParams
 11from vllm.profiler import nm_profile
 12
 13BATCH_SIZE_DEFAULT = 1
 14PROMPT_LEN_DEFAULT = 256
 15OUTPUT_LEN_DEFAULT = 2
 16
 17
 18@dataclass
 19class ProfileContext:
 20    model: str
 21    tokenizer: str
 22    model_revision: str
 23    quantization: str
 24    max_model_len: int
 25    max_num_batched_tokens: int
 26    prompt_len: int
 27    output_len: int
 28    batch_size: int
 29    dtype: str
 30    tensor_parallel_size: int
 31    allow_cuda_graphs: bool
 32
 33
 34def run_profile(context: ProfileContext, csv_output: Optional[str],
 35                json_output: Optional[str]):
 36    print("Run profile with:")
 37    for key, value in asdict(context).items():
 38        print(f"  {key} = {value}")
 39
 40    # Create sampling params
 41    sampling_params = SamplingParams(temperature=0.8,
 42                                     top_p=0.95,
 43                                     max_tokens=context.output_len,
 44                                     ignore_eos=True)
 45
 46    # Sparsity is in the future
 47    # Create LLM
 48    llm = LLM(model=context.model,
 49              tokenizer=context.tokenizer
 50              if context.tokenizer is not None else context.model,
 51              revision=context.model_revision,
 52              enforce_eager=not context.allow_cuda_graphs,
 53              tensor_parallel_size=context.tensor_parallel_size,
 54              gpu_memory_utilization=0.9,
 55              max_model_len=context.max_model_len,
 56              quantization=context.quantization,
 57              dtype=context.dtype,
 58              max_num_batched_tokens=context.max_num_batched_tokens)
 59
 60    batch_size = context.batch_size
 61    prompt_len = context.prompt_len
 62    output_len = context.output_len
 63
 64    scheduler_config = llm.llm_engine.scheduler_config
 65    max_model_len = llm.llm_engine.model_config.max_model_len
 66    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
 67    max_num_seqs = scheduler_config.max_num_seqs
 68
 69    if batch_size * prompt_len > max_num_batched_tokens:
 70        print(f"ERROR: chosen batch_size * prompt_len "
 71              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is  "
 72              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
 73              f"and therefore cannot be run in a single profile step, please "
 74              f"choose a smaller batch size or prompt length, or increase "
 75              f"--max_num_batched_tokens")
 76        sys.exit(-1)
 77    if batch_size >= max_num_seqs:
 78        print(
 79            f"ERROR: chosen batch_size ({batch_size}) is larger than "
 80            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
 81            f"single profile step, please choose a smaller batch size")
 82        sys.exit(-1)
 83    print("llm.llm_engine.model_config.max_model_len: ",
 84          llm.llm_engine.model_config.max_model_len)
 85    if prompt_len + output_len > llm.llm_engine.model_config.max_model_len:
 86        print(
 87            f"ERROR: chosen prompt_len + output_len ({prompt_len} + "
 88            f"{output_len} = {prompt_len + output_len}) is larger than the "
 89            f"model's max_model_len ({max_model_len}), please choose a smaller "
 90            f"prompt_len or output_len, or increase --max-model-len")
 91        sys.exit(-1)
 92
 93    for i in range(batch_size):
 94        prompt_token_ids = torch.randint(
 95            llm.llm_engine.model_config.get_vocab_size(),
 96            size=(prompt_len, )).tolist()
 97
 98        llm.llm_engine.add_request(
 99            request_id=f"seq{i}",
100            inputs={'prompt_token_ids': prompt_token_ids},
101            params=sampling_params)
102
103    with nm_profile() as prefill_prof:
104        llm.llm_engine.step()  # First step is prefill
105
106    decode_results_list = []
107    for _ in range(context.output_len - 1):
108        with nm_profile() as decode_prof:
109            llm.llm_engine.step()
110        decode_results_list.append(decode_prof.results)
111
112    prefill_results = prefill_prof.results
113    has_decode = len(decode_results_list) > 0
114
115    print("=" * 80)
116    print(f"= Prefill Model Table "
117          f"(prompt_len={prompt_len}, batch_size={batch_size})")
118    print("=" * 80)
119    print()
120    prefill_results.print_model_table()
121
122    if has_decode:
123        print()
124        print("=" * 80)
125        print(f"= First Decode Step Model Table "
126              f"(prompt_len={prompt_len}, batch_size={batch_size})")
127        print("=" * 80)
128        print()
129        decode_results_list[0].print_model_table()
130
131    print()
132    print("=" * 80)
133    print(f"= Prefill Summary Table "
134          f"(prompt_len={prompt_len}, batch_size={batch_size})")
135    print("=" * 80)
136    print()
137    prefill_results.print_summary_table()
138    if has_decode:
139        print()
140        print("=" * 80)
141        print(f"= First Decode Step Summary Table "
142              f"(prompt_len={prompt_len}, batch_size={batch_size})")
143        print("=" * 80)
144        print()
145        decode_results_list[0].print_summary_table()
146
147    if csv_output:
148        csv_filename_base = csv_output.rstrip(".csv")
149        prefill_results.export_model_stats_table_csv(
150            csv_filename_base + "_prefill_model_table.csv")
151        prefill_results.export_summary_stats_table_csv(
152            csv_filename_base + "_prefill_summary_table.csv")
153
154        if has_decode:
155            decode_results_list[0].export_model_stats_table_csv(\
156                csv_filename_base + "_decode_model_table.csv")
157            decode_results_list[0].export_summary_stats_table_csv(
158                csv_filename_base + "_decode_summary_table.csv")
159
160    if json_output:
161        cuda_devices = [
162            torch.cuda.get_device_properties(dev_idx)
163            for dev_idx in range(torch.cuda.device_count())
164        ]
165
166        json_dict = {
167            "context": {
168                "python_version": f"{sys.version}",
169                "torch_version": f"{torch.__version__}",
170                "torch_cuda_version": f"{torch.version.cuda}",
171                "cuda_devices": f"{cuda_devices}",
172                **asdict(context)
173            },
174            "prefill": prefill_results.convert_stats_to_dict(),
175        }
176
177        if has_decode:
178            for idx, dr in enumerate(decode_results_list):
179                json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
180
181        with open(json_output.rstrip(".json") + ".json", "w+") as f:
182            json.dump(json_dict, f, indent=2)
183        pass
184
185
if __name__ == "__main__":
    # Command-line entry point: parse the options, build a ProfileContext
    # from the ones that match its fields and run the profile.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help='The name or path of a HuggingFace Transformers model.')
    parser.add_argument("--tokenizer",
                        type=str,
                        default=None,
                        help="path to the tokenizer")

    parser.add_argument("--model-revision", type=str, default=None)
    parser.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Export the results as multiple csv files. This should be the "
        "root filename, will create <filename>_prefill_model_table.csv, "
        "<filename>_prefill_summary_table.csv, "
        "<filename>_decode_model_table.csv, and "
        "<filename>_decode_summary_table.csv")
    parser.add_argument(
        "--json",
        type=str,
        default=None,
        help="Export the results as a json file. This should be the filename")
    # None is not listed in choices: a command-line value is always a str and
    # could never match it, and the default is not checked against choices.
    parser.add_argument(
        "--quantization",
        "-q",
        type=str,
        choices=['awq', 'gptq', 'squeezellm', 'marlin', 'smoothquant'],
        default=None,
        help="The method used to quantize the model weights, options are "
        "\"marlin\", \"awq\", \"gptq\", \"squeezellm\", \"smoothquant\"")
    parser.add_argument("--dtype",
                        type=str,
                        default='auto',
                        help="model dtype")
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=None,
        help="Maximum length of a sequence (including prompt and output)")
    parser.add_argument(
        "--max-num-batched-tokens",
        type=int,
        default=None,
        help="Maximum number of tokens to be processed in a single iteration. "
        "Should be at least batch-size * prompt-len so the prefill can "
        "run in a single iteration.")
    parser.add_argument(
        "--prompt-len",
        type=int,
        default=PROMPT_LEN_DEFAULT,
        help=f"Length of the random prompt to use when profiling, all batched "
        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
    parser.add_argument(
        "--output-len",
        type=int,
        default=OUTPUT_LEN_DEFAULT,
        help=
        f"Number of output decode steps to run, default={OUTPUT_LEN_DEFAULT}")
    parser.add_argument("--batch-size",
                        type=int,
                        default=BATCH_SIZE_DEFAULT,
                        help=f"Number of requests to run as a single batch, "
                        f"default={BATCH_SIZE_DEFAULT}")
    parser.add_argument("--tensor-parallel-size",
                        "-tp",
                        type=int,
                        default=1,
                        help="Number of GPUs to use i.e. tensor parallelism, "
                        "default=1")
    parser.add_argument(
        "--allow-cuda-graphs",
        action='store_true',
        help="Enables cuda graphs to be used, will remove a lot of the module "
        "level info in the profiler results since almost everything runs in "
        "the graph where we do not have access to an informative stack trace")

    args = parser.parse_args()

    # Keep only the parsed options that are ProfileContext fields; --csv and
    # --json are passed to run_profile separately.
    context_fields = inspect.signature(ProfileContext).parameters
    context = ProfileContext(
        **{
            k: v
            for k, v in vars(args).items()
            if k in context_fields
        })
    run_profile(context, csv_output=args.csv, json_output=args.json)