Offline Profile

Source: vllm-project/vllm.

  1import argparse
  2import inspect
  3import json
  4import sys
  5from dataclasses import asdict, dataclass
  6from typing import Optional
  7
  8import torch
  9
 10from vllm import LLM, SamplingParams
 11from vllm.profiler import nm_profile
 12
 13BATCH_SIZE_DEFAULT = 1
 14PROMPT_LEN_DEFAULT = 256
 15MAX_SEQ_LEN_DEFAULT = 1024
 16
 17
 18@dataclass
 19class ProfileContext:
 20    model: str
 21    model_revision: str
 22    sparsity: str
 23    quantization: str
 24    max_seq_len: int
 25    max_num_batched_tokens: int
 26    prompt_len: int
 27    batch_size: int
 28    tensor_parallel_size: int
 29    allow_cuda_graphs: bool
 30
 31
 32def run_profile(context: ProfileContext, csv_output: Optional[str],
 33                json_output: Optional[str]):
 34    print("Run profile with:")
 35    for key, value in asdict(context).items():
 36        print(f"  {key} = {value}")
 37
 38    # Create sampling params
 39    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=8)
 40
 41    # Create LLM
 42    llm = LLM(
 43        model=context.model,
 44        revision=context.model_revision,
 45        sparsity=context.sparsity,
 46        enforce_eager=not context.allow_cuda_graphs,
 47        tensor_parallel_size=context.tensor_parallel_size,
 48        gpu_memory_utilization=0.9,
 49        max_model_len=context.max_seq_len,
 50        quantization=context.quantization,
 51        max_num_batched_tokens=context.max_num_batched_tokens,
 52    )
 53
 54    batch_size = context.batch_size
 55    prompt_len = context.prompt_len
 56
 57    scheduler_config = llm.llm_engine.scheduler_config
 58    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
 59    max_num_seqs = scheduler_config.max_num_seqs
 60
 61    if batch_size * prompt_len > max_num_batched_tokens:
 62        print(f"ERROR: chosen batch_size * prompt_len "
 63              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is  "
 64              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
 65              f"and therefore cannot be run in a single profile step, please "
 66              f"choose a smaller batch size or prompt length, or increase "
 67              f"--max_num_batched_tokens")
 68        sys.exit(-1)
 69    if batch_size >= max_num_seqs:
 70        print(
 71            f"ERROR: chosen batch_size ({batch_size}) is larger than "
 72            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
 73            f"single profile step, please choose a smaller batch size")
 74        sys.exit(-1)
 75
 76    for i in range(batch_size):
 77        llm.llm_engine.add_request(
 78            request_id=f"seq{i}",
 79            prompt=None,
 80            prompt_token_ids=torch.randint(
 81                128,  # 128 to skip over special tokens
 82                llm.llm_engine.model_config.get_vocab_size() // 2,
 83                size=(prompt_len, )).tolist(),
 84            sampling_params=sampling_params)
 85
 86    with nm_profile() as prefill_prof:
 87        llm.llm_engine.step()  # First step is prefill
 88
 89    with nm_profile() as decode_prof:
 90        llm.llm_engine.step()
 91
 92    prefill_results = prefill_prof.results
 93    decode_results = decode_prof.results
 94
 95    print("=" * 80)
 96    print(f"= Prefill Model Table "
 97          f"(prompt_len={prompt_len}, batch_size={batch_size})")
 98    print("=" * 80)
 99    print()
100    prefill_results.print_model_table()
101    print()
102    print("=" * 80)
103    print(f"= Decode Model Table "
104          f"(prompt_len={prompt_len}, batch_size={batch_size})")
105    print("=" * 80)
106    print()
107    decode_results.print_model_table()
108    print()
109    print("=" * 80)
110    print(f"= Prefill Summary Table "
111          f"(prompt_len={prompt_len}, batch_size={batch_size})")
112    print("=" * 80)
113    print()
114    prefill_results.print_summary_table()
115    print()
116    print("=" * 80)
117    print(f"= Decode Summary Table "
118          f"(prompt_len={prompt_len}, batch_size={batch_size})")
119    print("=" * 80)
120    print()
121    decode_results.print_summary_table()
122
123    if csv_output:
124        csv_filename_base = csv_output.rstrip(".csv")
125        prefill_results.export_model_stats_table_csv(
126            csv_filename_base + "_prefill_model_table.csv")
127        prefill_results.export_summary_stats_table_csv(
128            csv_filename_base + "_prefill_summary_table.csv")
129        decode_results.export_model_stats_table_csv(\
130            csv_filename_base + "_decode_model_table.csv")
131        decode_results.export_summary_stats_table_csv(
132            csv_filename_base + "_decode_summary_table.csv")
133
134    if json_output:
135        cuda_devices = [
136            torch.cuda.get_device_properties(dev_idx)
137            for dev_idx in range(torch.cuda.device_count())
138        ]
139
140        json_dict = {
141            "context": {
142                "python_version": f"{sys.version}",
143                "torch_version": f"{torch.__version__}",
144                "torch_cuda_version": f"{torch.version.cuda}",
145                "cuda_devices": f"{cuda_devices}",
146                **asdict(context)
147            },
148            "prefill": prefill_results.convert_stats_to_dict(),
149            "decode": decode_results.convert_stats_to_dict()
150        }
151
152        with open(json_output.rstrip(".json") + ".json", "w+") as f:
153            json.dump(json_dict, f, indent=2)
154        pass
155
156
if __name__ == "__main__":
    # Command-line entry point: parse the options, build a ProfileContext
    # from them and run the profile.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help='The name or path of a HuggingFace Transformers model.')
    parser.add_argument("--model-revision", type=str, default=None)
    parser.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Export the results as multiple csv files. This should be the "
        "root filename, will create <filename>_prefill_model_table.csv, "
        "<filename>_prefill_summary_table.csv, "
        "<filename>_decode_model_table.csv, and "
        "<filename>_decode_summary_table.csv")
    parser.add_argument(
        "--json",
        type=str,
        default=None,
        help="Export the results as a json file. This should be the filename")
    parser.add_argument(
        "--sparsity",
        "-s",
        type=str,
        choices=[None, 'sparse_w16a16', 'semi_structured_sparse_w16a16'],
        default=None,
        help="Method used to compress sparse weights. If "
        "None, we first check the `sparsity_config` attribute "
        "in the model config file. If that is None we assume "
        "the model weights are dense")
    parser.add_argument(
        "--quantization",
        "-q",
        type=str,
        choices=['awq', 'gptq', 'squeezellm', 'marlin', None],
        default=None,
        help="The method used to quantize the model weights, "
        "options are \"marlin\", \"awq\", \"gptq\" and \"squeezellm\"")
    parser.add_argument(
        "--max-seq-len",
        type=int,
        default=MAX_SEQ_LEN_DEFAULT,
        help="Maximum length of a sequence (including prompt and output), "
        f"default={MAX_SEQ_LEN_DEFAULT}")
    parser.add_argument(
        "--max-num-batched-tokens",
        type=int,
        default=None,
        help="Maximum number of tokens to be processed in a single iteration. "
        "Should be at least batch-size * prompt-len so the prefill can "
        "run in a single iteration.")
    parser.add_argument(
        "--prompt-len",
        type=int,
        default=PROMPT_LEN_DEFAULT,
        help="Length of the random prompt to use when profiling, all batched "
        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
    parser.add_argument("--batch-size",
                        type=int,
                        default=BATCH_SIZE_DEFAULT,
                        help="Number of requests to run as a single batch, "
                        f"default={BATCH_SIZE_DEFAULT}")
    parser.add_argument("--tensor-parallel-size",
                        "-tp",
                        type=int,
                        default=1,
                        help="Number of GPUs to use i.e. tensor parallelism, "
                        "default=1")
    parser.add_argument(
        "--allow-cuda-graphs",
        action='store_true',
        help="Enables cuda graphs to be used, will remove a lot of the module "
        "level info in the profiler results since almost everything runs in "
        "the graph where we do not have access to an informative stack trace")

    args = parser.parse_args()

    # Keep only the parsed options that are ProfileContext constructor
    # parameters; --csv and --json are passed to run_profile separately.
    context_params = inspect.signature(ProfileContext).parameters
    context = ProfileContext(
        **{
            k: v
            for k, v in vars(args).items()
            if k in context_params
        })
    run_profile(context, csv_output=args.csv, json_output=args.json)