A deep-dive into how HTTP requests travel from the CLI through FastAPI routes, Pydantic validation, prompt construction, and into the AsyncLLM engine — based on reading the actual vLLM source code.
Source: vllm-project/vllm (V1 architecture)
How vllm serve Launches the Server
The journey from a user typing vllm serve meta-llama/Llama-3-8B to a running
FastAPI server involves three layers: the CLI dispatcher, the serve subcommand, and
the Uvicorn HTTP server wrapping a FastAPI application.
vllm/entrypoints/cli/main.py
The vllm command is a console script pointing at main().
It lazily imports all subcommand modules and registers them with argparse:
def main():
import vllm.entrypoints.cli.serve
import vllm.entrypoints.cli.launch
import vllm.entrypoints.cli.openai
import vllm.entrypoints.cli.run_batch
...
CMD_MODULES = [
vllm.entrypoints.cli.openai,
vllm.entrypoints.cli.serve,
vllm.entrypoints.cli.launch,
vllm.entrypoints.cli.benchmark.main,
vllm.entrypoints.cli.collect_env,
vllm.entrypoints.cli.run_batch,
]
cli_env_setup()
parser = FlexibleArgumentParser(description="vLLM CLI")
subparsers = parser.add_subparsers(required=False, dest="subparser")
cmds = {}
for cmd_module in CMD_MODULES:
new_cmds = cmd_module.cmd_init()
for cmd in new_cmds:
cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)
cmds[cmd.name] = cmd
args = parser.parse_args()
if args.subparser in cmds:
cmds[args.subparser].validate(args)
if hasattr(args, "dispatch_function"):
args.dispatch_function(args) # calls ServeSubcommand.cmd(args)
Each subcommand is a CLISubcommand class that implements
cmd(), validate(), and subparser_init(). The serve subcommand
is defined in vllm/entrypoints/cli/serve.py.
The ServeSubcommand.cmd() method handles three modes depending on
api_server_count: headless (no API servers), multi-server (data parallelism),
and single-server (the common case):
class ServeSubcommand(CLISubcommand):
name = "serve"
@staticmethod
def cmd(args: argparse.Namespace) -> None:
# If model is specified in CLI as positional arg, it takes precedence
if hasattr(args, "model_tag") and args.model_tag is not None:
args.model = args.model_tag
if args.api_server_count < 1:
run_headless(args) # No HTTP server, engine only
elif args.api_server_count > 1:
run_multi_api_server(args) # Multiple workers with shared engine
else:
# Single API server (this process) -- most common path
args.api_server_count = None
uvloop.run(run_server(args))
The single-server path goes through three phases: socket binding, engine creation, and HTTP serving:
vllm/entrypoints/openai/api_server.py
async def run_server(args, **uvicorn_kwargs) -> None:
"""Run a single-worker API server."""
decorate_logs("APIServer")
listen_address, sock = setup_server(args) # Bind socket BEFORE engine
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
async def run_server_worker(listen_address, sock, args, ...) -> None:
async with build_async_engine_client(args) as engine_client:
shutdown_task = await build_and_serve(
engine_client, listen_address, sock, args, **uvicorn_kwargs
)
await shutdown_task # Wait until Uvicorn shuts down
sock.close()
In V1, the engine client is always AsyncLLM created in-process (no separate RPC
process). The build function creates the VllmConfig first, then instantiates
AsyncLLM.from_vllm_config():
async def build_async_engine_client_from_engine_args(
engine_args, *, usage_context, client_config,
) -> AsyncIterator[EngineClient]:
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
from vllm.v1.engine.async_llm import AsyncLLM
async_llm = AsyncLLM.from_vllm_config(
vllm_config=vllm_config,
usage_context=usage_context,
enable_log_requests=engine_args.enable_log_requests,
...
)
await async_llm.reset_mm_cache()
yield async_llm
The build_and_serve() function creates the FastAPI app, initializes state, then
hands everything to Uvicorn via serve_http():
async def build_and_serve(engine_client, listen_address, sock, args, **uvicorn_kwargs):
supported_tasks = await engine_client.get_supported_tasks()
model_config = engine_client.model_config
app = build_app(args, supported_tasks, model_config)
await init_app_state(engine_client, app.state, args, supported_tasks)
return await serve_http(
app, sock=sock,
host=args.host, port=args.port,
log_level=args.uvicorn_log_level,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
...
)
vllm/entrypoints/launcher.py
async def serve_http(app: FastAPI, sock, **uvicorn_kwargs):
# Log all available routes
for route in app.routes:
methods = getattr(route, "methods", None)
path = getattr(route, "path", None)
if methods and path:
logger.info("Route: %s, Methods: %s", path, ", ".join(methods))
config = uvicorn.Config(app, **uvicorn_kwargs)
config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
config.load()
server = uvicorn.Server(config)
app.state.server = server
...
The build_app() function conditionally registers routers based on which
tasks the model supports. Each router is a separate module with an attach_router(app)
function that calls app.include_router(router):
def build_app(args, supported_tasks, model_config) -> FastAPI:
app = FastAPI(lifespan=lifespan)
# Always registered: vLLM serve routers (LoRA, profile, sleep, cache, tokenize)
register_vllm_serve_api_routers(app)
# Always registered: models endpoint
register_models_api_router(app)
# Conditionally registered based on supported_tasks:
if "generate" in supported_tasks:
register_generate_api_routers(app) # chat + completion + responses + anthropic
attach_disagg_router(app) # /v1/generate (disagg serving)
attach_rlhf_router(app) # RLHF endpoints
elastic_ep_attach_router(app) # Elastic EP
if "transcription" in supported_tasks:
register_speech_to_text_api_router(app)
if "realtime" in supported_tasks:
register_realtime_api_router(app) # WebSocket
if any(task in POOLING_TASKS for task in supported_tasks):
register_pooling_api_routers(app, supported_tasks, model_config)
The register_generate_api_routers() function in vllm/entrypoints/openai/generate/api_router.py
wires up the core OpenAI endpoints:
def register_generate_api_routers(app: FastAPI):
# 1. POST /v1/chat/completions, /v1/chat/completions/batch
register_chat_api_router(app)
# 2. POST /v1/responses (OpenAI Responses API)
register_responses_api_router(app)
# 3. POST /v1/completions
register_completion_api_router(app)
# 4. POST /v1/messages (Anthropic-compatible)
register_anthropic_api_router(app)
| Endpoint | Method | Module | Task Guard |
|---|---|---|---|
| /v1/chat/completions | POST | chat_completion/api_router.py | "generate" |
| /v1/chat/completions/batch | POST | chat_completion/api_router.py | "generate" |
| /v1/completions | POST | completion/api_router.py | "generate" |
| /v1/responses | POST | responses/api_router.py | "generate" |
| /v1/messages | POST | anthropic/api_router.py | "generate" |
| /v1/models | GET | models/api_router.py | Always |
| /tokenize, /detokenize | POST | serve/tokenize/api_router.py | Always |
| /v1/lora/load, /v1/lora/unload | POST | serve/lora/api_router.py | Always |
| /v1/embeddings, /v1/score | POST | pooling/ modules | POOLING_TASKS |
| /v1/audio/transcriptions | POST | speech_to_text/api_router.py | "transcription" |
After route registration, build_app() adds a layered middleware stack:
# CORS middleware
app.add_middleware(CORSMiddleware, allow_origins=args.allowed_origins, ...)
# API key authentication
if tokens:
app.add_middleware(AuthenticationMiddleware, tokens=tokens)
# Request ID tracking
if args.enable_request_id_headers:
app.add_middleware(XRequestIdMiddleware)
# Scaling state middleware (elastic EP)
app.add_middleware(ScalingMiddleware)
# Exception handlers
app.exception_handler(HTTPException)(http_exception_handler)
app.exception_handler(RequestValidationError)(validation_exception_handler)
app.exception_handler(EngineGenerateError)(engine_error_handler)
app.exception_handler(EngineDeadError)(engine_error_handler)
app.exception_handler(Exception)(exception_handler)
When a POST arrives at /v1/chat/completions, it flows through: FastAPI route
function → OpenAIServingChat.create_chat_completion() → rendering →
engine generation → streaming or non-streaming response.
router = APIRouter()
@router.post("/v1/chat/completions",
dependencies=[Depends(validate_json_request)])
@with_cancellation
@load_aware_call
async def create_chat_completion(
request: ChatCompletionRequest, # Pydantic auto-validates JSON body
raw_request: Request
):
handler = chat(raw_request) # -> request.app.state.openai_serving_chat
if handler is None:
raise NotImplementedError("Model does not support Chat Completions API")
generator = await handler.create_chat_completion(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(), status_code=...)
elif isinstance(generator, ChatCompletionResponse):
return JSONResponse(content=generator.model_dump())
# Streaming: return as text/event-stream (SSE)
return StreamingResponse(content=generator, media_type="text/event-stream")
@with_cancellation ensures the engine aborts if the client disconnects.
@load_aware_call tracks concurrent request count for load-based routing.
This is the central orchestration method. It coordinates model validation, prompt rendering, sampling parameter construction, and engine dispatch:
vllm/entrypoints/openai/chat_completion/serving.py
async def create_chat_completion(self, request, raw_request):
# Step 1: Initialize reasoning parser (for models with chain-of-thought)
reasoning_parser = None
if self.reasoning_parser_cls:
reasoning_parser = self.reasoning_parser_cls(tokenizer, ...)
# Step 2: Validate model and render chat messages into engine input
result = await self.render_chat_request(request)
if isinstance(result, ErrorResponse):
return result
conversation, engine_inputs = result
# Step 3: Assign request ID and metadata
request_id = f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
request_metadata = RequestResponseMetadata(request_id=request_id)
# Step 4: Resolve LoRA adapter
lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
# Step 5: Build sampling parameters and submit to engine
for i, engine_input in enumerate(engine_inputs):
max_tokens = get_max_tokens(
max_model_len,
request.max_completion_tokens or request.max_tokens,
self._extract_prompt_len(engine_input),
self.default_sampling_params,
self.override_max_tokens,
)
sampling_params = request.to_sampling_params(max_tokens, self.default_sampling_params)
generator = self.engine_client.generate(
engine_input,
sampling_params,
sub_request_id,
lora_request=lora_request,
priority=request.priority,
data_parallel_rank=data_parallel_rank,
)
generators.append(generator)
# Step 6: Branch on streaming vs non-streaming
if request.stream:
return self.chat_completion_stream_generator(request, result_generator, ...)
return await self.chat_completion_full_generator(request, result_generator, ...)
The rendering step is delegated to OpenAIServingRender, which handles chat template
application, multimodal input resolution, and tokenization:
async def render_chat_request(self, request):
# Check model exists and is accessible
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
return error_check_ret
# Ensure engine is alive (important for streaming where 200 is sent early)
if self.engine_client.errored:
raise self.engine_client.dead_error
# Delegate to the rendering layer
return await self.openai_serving_render.render_chat(request)
The streaming generator yields SSE chunks as the engine produces tokens. It handles tool call parsing, reasoning extraction, logprobs, and usage tracking:
async def chat_completion_stream_generator(self, request, result_generator, ...):
created_time = int(time.time())
first_iteration = True
# First chunk: send role ("assistant") for each choice
async for res in result_generator:
if first_iteration:
for i in range(num_choices):
chunk = ChatCompletionStreamResponse(
id=request_id,
object="chat.completion.chunk",
choices=[ChatCompletionResponseStreamChoice(
index=i,
delta=DeltaMessage(role="assistant", content=""),
)],
model=model_name,
)
yield f"data: {chunk.model_dump_json()}\n\n"
first_iteration = False
# Subsequent chunks: delta text, tool calls, reasoning
for output in res.outputs:
... # Tool parser, reasoning parser, logprobs extraction
delta = DeltaMessage(content=delta_text)
chunk = ChatCompletionStreamResponse(
id=request_id, choices=[ChatCompletionResponseStreamChoice(
index=i, delta=delta, logprobs=logprobs,
finish_reason=output.finish_reason,
)], model=model_name,
)
yield f"data: {chunk.model_dump_json()}\n\n"
# Final chunk with usage information
if include_usage:
final_chunk = ChatCompletionStreamResponse(
id=request_id, choices=[], usage=final_usage_info, model=model_name,
)
yield f"data: {final_chunk.model_dump_json()}\n\n"
yield "data: [DONE]\n\n"
vLLM defines rich Pydantic models that closely mirror the OpenAI API specification
while adding vLLM-specific extensions. All models inherit from OpenAIBaseModel.
Located in vllm/entrypoints/openai/chat_completion/protocol.py, this model captures
the full OpenAI chat completion API plus vLLM extensions:
OpenAI-standard fields: messages, model, temperature, top_p,
n, stream, stop, max_completion_tokens,
tools, tool_choice, response_format,
frequency_penalty, presence_penalty, seed,
logprobs, top_logprobs.
vLLM sampling extensions: top_k, min_p, repetition_penalty,
use_beam_search, length_penalty,
stop_token_ids, min_tokens,
allowed_token_ids, bad_words.
Templating and input extensions: chat_template, chat_template_kwargs,
add_generation_prompt, continue_final_message,
echo, documents (RAG),
media_io_kwargs, mm_processor_kwargs.
Other vLLM extensions: structured_outputs (json/regex/choice),
reasoning_effort, thinking_token_budget,
priority, cache_salt,
kv_transfer_params, vllm_xargs,
repetition_detection.
class ChatCompletionRequest(OpenAIBaseModel):
messages: list[ChatCompletionMessageParam]
model: str | None = None
temperature: float | None = None
stream: bool | None = False
tools: list[ChatCompletionToolsParam] | None = None
tool_choice: Literal["none"] | Literal["auto"] | ... = "none"
reasoning_effort: Literal["none", "low", "medium", "high"] | None = None
...
@model_validator(mode="before")
def validate_stream_options(cls, data):
if data.get("stream_options") and not data.get("stream"):
raise VLLMValidationError("Stream options only when stream=True")
return data
@model_validator(mode="before")
def check_tool_usage(cls, data):
# If tools provided but no tool_choice, default to "auto"
if "tool_choice" not in data and data.get("tools"):
data["tool_choice"] = "auto"
# Validate named tool_choice matches a defined tool
if isinstance(data["tool_choice"], dict):
... # cross-reference tool_choice.function.name with tools list
return data
@model_validator(mode="before")
def check_generation_prompt(cls, data):
if data.get("continue_final_message") and data.get("add_generation_prompt"):
raise ValueError("Cannot set both to True")
return data
def to_sampling_params(self, max_tokens, default_sampling_params) -> SamplingParams:
# Fall back to model's default_sampling_params from generation_config
if (temperature := self.temperature) is None:
temperature = default_sampling_params.get("temperature", 1.0)
if (top_p := self.top_p) is None:
top_p = default_sampling_params.get("top_p", 1.0)
...
return SamplingParams.from_optional(
n=self.n,
temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p,
seed=self.seed,
stop=self.stop, stop_token_ids=self.stop_token_ids,
max_tokens=max_tokens, min_tokens=self.min_tokens,
logprobs=self.top_logprobs if self.logprobs else None,
# DELTA for streaming, FINAL_ONLY for non-streaming
output_kind=RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY,
structured_outputs=self.structured_outputs,
skip_clone=True, # Fresh per request, safe to skip clone
)
class CompletionRequest(OpenAIBaseModel):
model: str | None = None
prompt: list[int] | list[list[int]] | str | list[str] | None = None
echo: bool | None = False
max_tokens: int | None = 16
n: int = 1
stream: bool | None = False
# vLLM extension: accept pre-computed embeddings
prompt_embeds: bytes | list[bytes] | None = None
...
class ChatCompletionResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: Literal["chat.completion"] = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: list[ChatCompletionResponseChoice]
usage: UsageInfo
# vLLM extensions
prompt_logprobs: list[dict[int, Logprob] | None] | None = None
kv_transfer_params: dict[str, Any] | None = None
class ChatCompletionStreamResponse(OpenAIBaseModel):
id: str
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
choices: list[ChatCompletionResponseStreamChoice]
usage: UsageInfo | None = None
class ChatMessage(OpenAIBaseModel):
role: str
content: str | None = None
reasoning: str | None = None # vLLM extension for chain-of-thought
tool_calls: list[ToolCall] = Field(default_factory=list)
vLLM supports images, audio, video, and pre-computed embeddings within chat messages.
The multimodal content types are defined in vllm/entrypoints/chat_utils.py as a
rich union type.
# The full union of all supported content part types
ChatCompletionContentPartParam: TypeAlias = (
OpenAIChatCompletionContentPartParam # text, image_url
| ChatCompletionContentPartAudioParam # {"audio_url": {"url": "..."}}
| ChatCompletionContentPartInputAudioParam # {"input_audio": {...}}
| ChatCompletionContentPartVideoParam # {"video_url": {"url": "..."}}
| ChatCompletionContentPartRefusalParam # refusal content
| CustomChatCompletionContentPILImageParam # {"image_pil": PIL.Image}
| CustomChatCompletionContentSimpleImageParam
| ChatCompletionContentPartImageEmbedsParam # pre-computed image embeds
| ChatCompletionContentPartAudioEmbedsParam # pre-computed audio embeds
| CustomChatCompletionContentSimpleAudioParam
| CustomChatCompletionContentSimpleVideoParam
| str # plain text shorthand
| CustomThinkCompletionContentParam # thinking/reasoning
)
class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
image_embeds: str | dict[str, str] | None
"""Base64-encoded image embeddings (single or dict)."""
type: Required[Literal["image_embeds"]]
uuid: str | None
"""User-provided UUID for deduplication."""
class ChatCompletionContentPartAudioEmbedsParam(TypedDict, total=False):
audio_embeds: str | dict[str, str] | None
"""Base64-encoded serialized torch tensor."""
type: Required[Literal["audio_embeds"]]
uuid: str | None
MODALITY_PLACEHOLDERS_MAP = {
"image": "<##IMAGE##>",
"audio": "<##AUDIO##>",
"video": "<##VIDEO##>",
}
These placeholders are inserted into the text during chat template rendering, then
replaced by the model's actual multimodal tokens during input processing. The
media_io_kwargs field on ChatCompletionRequest allows per-request configuration
of media connectors (e.g., custom image resizing):
# In ChatCompletionRequest:
media_io_kwargs: dict[str, dict[str, Any]] | None = Field(
default=None,
description="Additional kwargs to pass to media IO connectors, keyed by modality.",
)
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description="Additional kwargs to pass to the HF processor.",
)
A model validator warns when system messages contain non-text content parts, since this is not part of the OpenAI spec:
@model_validator(mode="before")
def check_system_message_content_type(cls, data):
for msg in data.get("messages", []):
if msg.get("role") == "system" and isinstance(msg.get("content"), list):
for part in msg["content"]:
part_type = part.get("type")
# Infer type from fields: image_url, audio_url, video_url, etc.
if part_type and part_type != "text":
logger.warning_once(
"System messages should only contain text content. Found: '%s'",
part_type,
)
return data
The LLM class in vllm/entrypoints/llm.py provides a synchronous Python API
for batch inference. It wraps the V1 LLMEngine and handles prompt preprocessing,
tokenization, and result collection.
class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.
This class includes a tokenizer, a language model (possibly distributed
across multiple GPUs), and GPU memory space allocated for intermediate
states (aka KV cache). Given a batch of prompts and sampling parameters,
this class generates texts from the model, using an intelligent batching
mechanism and efficient memory management."""
def __init__(
self,
model: str,
*,
tokenizer: str | None = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: str | None = None,
gpu_memory_utilization: float = 0.9,
...
**kwargs,
):
engine_args = EngineArgs(
model=model, tokenizer=tokenizer,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype, quantization=quantization,
gpu_memory_utilization=gpu_memory_utilization,
...
)
self.llm_engine = LLMEngine.from_engine_args(
engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
)
self.model_config = self.llm_engine.model_config
self.renderer = self.llm_engine.renderer
self.chat_template = load_chat_template(chat_template)
def generate(
self,
prompts: PromptType | Sequence[PromptType],
sampling_params: SamplingParams | Sequence[SamplingParams] | None = None,
*,
use_tqdm: bool = True,
lora_request: LoRARequest | None = None,
priority: list[int] | None = None,
) -> list[RequestOutput]:
"""Generate completions for the input prompts.
This class automatically batches the given prompts, considering
the memory constraint. For best performance, put all prompts
into a single list and pass it to this method."""
if self.model_config.runner_type != "generate":
raise ValueError("LLM.generate() is only supported for generative models.")
if sampling_params is None:
sampling_params = self.get_default_sampling_params()
return self._run_completion(
prompts=prompts, params=sampling_params,
output_type=RequestOutput, use_tqdm=use_tqdm,
lora_request=lora_request, priority=priority,
)
V1 also supports an asynchronous enqueue-wait pattern for more control over batching:
# Enqueue requests without blocking
request_ids = llm.enqueue(prompts, sampling_params)
# Process queue and collect results
outputs = llm.wait_for_completion(use_tqdm=True)
The PromptType union allows flexible input formats:
"What is the capital of France?"{"prompt_token_ids": [1, 2048, ...]}{"prompt": "Describe this image:", "multi_modal_data": {"image": img}}{"prompt_embeds": tensor, ...}
Tokenization in vLLM V1 is handled through the Renderer abstraction, which wraps
HuggingFace tokenizers and processors. The tokenizer is loaded during engine
initialization and shared across all serving handlers.
The tokenizer is loaded as part of the model configuration. The tokenizer_mode
parameter controls whether to use the "fast" (Rust-based) or "slow" (Python-based)
tokenizer from HuggingFace:
# In LLM.__init__:
engine_args = EngineArgs(
model=model,
tokenizer=tokenizer, # Optional custom tokenizer path
tokenizer_mode="auto", # "auto" | "slow" | "fast" | "mistral"
skip_tokenizer_init=False,
trust_remote_code=trust_remote_code,
...
)
# The LLMEngine creates the renderer which holds the tokenizer
self.llm_engine = LLMEngine.from_engine_args(engine_args)
self.renderer = self.llm_engine.renderer
# Access tokenizer via renderer
tokenizer = self.renderer.tokenizer
# Or via the engine method:
tokenizer = self.llm_engine.get_tokenizer()
Each request builds a TokenizeParams object that controls how tokenization
is performed. The parameters vary between chat and completion requests:
# From ChatCompletionRequest.build_tok_params():
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
return TokenizeParams(
max_total_tokens=model_config.max_model_len,
max_output_tokens=self.max_completion_tokens or 0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
add_special_tokens=self.add_special_tokens, # Default False for chat
needs_detokenization=bool(self.echo and not self.return_token_ids),
max_total_tokens_param="max_model_len",
max_output_tokens_param="max_completion_tokens",
)
# From CompletionRequest.build_tok_params():
def build_tok_params(self, model_config) -> TokenizeParams:
return TokenizeParams(
...
add_special_tokens=self.add_special_tokens, # Default True for completion
...
)
Chat requests default to add_special_tokens=False because
the chat template handles special tokens. Completion requests default to True
to add BOS tokens automatically.
The renderers/inputs/preprocess.py module provides utilities for normalizing
prompts into the internal EngineInput format:
# Normalize single prompt or list of prompts
def prompt_to_seq(
prompt_or_prompts: PromptType | Sequence[PromptType],
) -> Sequence[PromptType]:
if isinstance(prompt_or_prompts, (dict, str, bytes)):
return [prompt_or_prompts] # Wrap single prompt in list
return prompt_or_prompts # Already a sequence
# Normalize single conversation or list of conversations
def conversation_to_seq(
conversation_or_conversations: ...,
) -> Sequence[list[ChatCompletionMessageParam]]:
if is_list_of(conversation_or_conversations, dict):
return [conversation_or_conversations]
return conversation_or_conversations
# vllm/renderers/inputs/preprocess.py
DecoderOnlyDictPrompt: TypeAlias = TextPrompt | TokensPrompt | EmbedsPrompt
EncoderDictPrompt: TypeAlias = TextPrompt | TokensPrompt
DecoderDictPrompt: TypeAlias = TextPrompt | TokensPrompt
class EncoderDecoderDictPrompt(TypedDict):
encoder_prompt: EncoderDictPrompt
decoder_prompt: DecoderDictPrompt | None
vLLM uses Jinja2-based chat templates (from HuggingFace tokenizer_config.json)
to convert a list of message objects into a single string or token sequence that the
model understands.
# load_chat_template() accepts:
# 1. A file path to a .jinja template
# 2. A Jinja template string
# 3. None (use model's built-in template)
resolved_chat_template = load_chat_template(args.chat_template)
# The template is stored in app state and passed to all serving handlers
state.openai_serving_chat = OpenAIServingChat(
...
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
trust_request_chat_template=args.trust_request_chat_template,
)
Clients can pass a custom chat_template in the request body. This is gated by the
--trust-request-chat-template flag for security:
class ChatCompletionRequest(OpenAIBaseModel):
chat_template: str | None = Field(
default=None,
description="A Jinja template to use for this conversion.",
)
chat_template_kwargs: dict[str, Any] | None = Field(
default=None,
description="Additional keyword args to pass to the template renderer.",
)
The request's template parameters are bundled into a ChatParams object that
the renderer uses:
def build_chat_params(self, default_template, default_template_content_format):
return ChatParams(
chat_template=self.chat_template or default_template,
chat_template_content_format=default_template_content_format,
chat_template_kwargs=merge_kwargs(
self.chat_template_kwargs,
dict(
add_generation_prompt=self.add_generation_prompt,
continue_final_message=self.continue_final_message,
documents=self.documents,
reasoning_effort=self.reasoning_effort,
),
),
media_io_kwargs=self.media_io_kwargs,
)
The chat_template_content_format controls how message content is represented
when passed to the template:
# Passed in by user (CLI or request)
ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
# After resolving "auto"
ChatTemplateContentFormat = Literal["string", "openai"]
# "string" - content is passed as a plain string to the template
# "openai" - content is passed as a list of {"type": ..., ...} objects
# "auto" - automatically detects the right format
class ConversationMessage(TypedDict, total=False):
role: Required[str]
content: str | None | list[dict[str, str]]
tool_call_id: str | None
name: str | None
tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
reasoning: str | None # interleaved thinking
reasoning_content: str | None # deprecated alias
tools: list[ChatCompletionFunctionToolParam] | None
The init_app_state() function is where all the serving handler classes are
instantiated and stored on app.state. This is the central wiring point:
async def init_app_state(engine_client, state, args, supported_tasks):
vllm_config = engine_client.vllm_config
# 1. Model registry
state.openai_serving_models = OpenAIServingModels(
engine_client=engine_client,
base_model_paths=[BaseModelPath(name=name, model_path=args.model)
for name in served_model_names],
lora_modules=lora_modules,
)
await state.openai_serving_models.init_static_loras()
# 2. Render service (chat template + tokenization)
state.openai_serving_render = OpenAIServingRender(
model_config=engine_client.model_config,
renderer=engine_client.renderer,
io_processor=engine_client.io_processor,
model_registry=state.openai_serving_models.registry,
chat_template=resolved_chat_template,
...
)
# 3. Tokenization endpoint service
state.openai_serving_tokenization = OpenAIServingTokenization(
engine_client, state.openai_serving_models, state.openai_serving_render,
...
)
# 4. Generate endpoints (chat, completion, responses, anthropic)
if "generate" in supported_tasks:
await init_generate_state(engine_client, state, args, request_logger, supported_tasks)
vllm/entrypoints/openai/generate/api_router.py
async def init_generate_state(engine_client, state, args, request_logger, supported_tasks):
# Chat handler with tool parsing and reasoning support
state.openai_serving_chat = OpenAIServingChat(
engine_client=engine_client,
models=state.openai_serving_models,
response_role=args.response_role,
openai_serving_render=state.openai_serving_render,
chat_template=resolved_chat_template,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
...
)
# Completion handler
state.openai_serving_completion = OpenAIServingCompletion(
engine_client=engine_client,
models=state.openai_serving_models,
openai_serving_render=state.openai_serving_render,
...
)
# Responses API handler (OpenAI's newer API)
state.openai_serving_responses = OpenAIServingResponses(
engine_client, state.openai_serving_models, state.openai_serving_render,
...
)
# Anthropic-compatible handler
state.anthropic_serving_messages = AnthropicServingMessages(
engine_client, state.openai_serving_models, ...
)
All serving handlers (OpenAIServingChat, OpenAIServingCompletion,
OpenAIServingResponses) inherit from OpenAIServing, which provides shared
infrastructure:
class OpenAIServing:
def __init__(self, engine_client, models, *, request_logger, return_tokens_as_token_ids):
self.engine_client = engine_client # AsyncLLM instance
self.models = models # OpenAIServingModels
self.request_logger = request_logger
self.return_tokens_as_token_ids = return_tokens_as_token_ids
# Derived from engine client
self.model_config = engine_client.model_config
self.renderer = engine_client.renderer # Holds tokenizer
self.io_processor = engine_client.io_processor
self.input_processor = engine_client.input_processor
# Shared methods available to all handlers:
async def _check_model(self, request): ... # Validate model name + LoRA
def _maybe_get_adapters(self, request): ... # Resolve LoRA adapter
async def beam_search(self, prompt, ...): ... # Beam search implementation
def _log_inputs(self, request_id, ...): ... # Request logging
def create_error_response(self, msg): ... # Error response factory
def create_streaming_error_response(self, e): ...# Streaming error
# All request types that go through the completion path:
CompletionLikeRequest: TypeAlias = (
CompletionRequest | TokenizeCompletionRequest |
DetokenizeRequest | RerankRequest |
ScoreRequest | PoolingCompletionRequest
)
# All request types that go through the chat path:
ChatLikeRequest: TypeAlias = (
ChatCompletionRequest | BatchChatCompletionRequest |
TokenizeChatRequest | PoolingChatRequest
)
# The full union of all request types:
AnyRequest: TypeAlias = (
CompletionLikeRequest | ChatLikeRequest |
SpeechToTextRequest | ResponsesRequest |
IOProcessorRequest | GenerateRequest
)
The completion handler follows a similar pattern to chat but with simpler prompt
processing (no chat template). It supports multiple prompts in a single request
via merge_async_iterators():
class OpenAIServingCompletion(OpenAIServing):
async def create_completion(self, request, raw_request):
# 1. Render (tokenize the prompt without chat template)
result = await self.render_completion_request(request)
if isinstance(result, ErrorResponse):
return result
engine_inputs = result
# 2. Create generators for each prompt
generators = []
for i, engine_input in enumerate(engine_inputs):
max_tokens = get_max_tokens(
max_model_len, request.max_tokens,
self._extract_prompt_len(engine_input),
self.default_sampling_params, self.override_max_tokens,
)
sampling_params = request.to_sampling_params(
max_tokens, self.default_sampling_params)
generator = self.engine_client.generate(
engine_input, sampling_params, request_id_item,
lora_request=lora_request, priority=request.priority,
)
generators.append(generator)
# 3. Merge generators and branch on streaming
result_generator = merge_async_iterators(*generators)
if request.stream:
return self.completion_stream_generator(...)
# 4. Non-streaming: collect all results
final_res_batch = [None] * num_prompts
async for i, res in result_generator:
final_res_batch[i] = res
return self.request_output_to_completion_response(final_res_batch, ...)
| File | Purpose |
|---|---|
| entrypoints/cli/main.py | CLI dispatcher: parses vllm [subcommand], routes to handler |
| entrypoints/cli/serve.py | vllm serve command: single/multi/headless server launch |
| entrypoints/openai/api_server.py | FastAPI app construction, engine client creation, state initialization |
| entrypoints/launcher.py | Uvicorn server wrapper with SSL and header limit support |
| entrypoints/openai/generate/api_router.py | Registers chat, completion, responses, and anthropic routers |
| entrypoints/openai/chat_completion/api_router.py | /v1/chat/completions route function |
| entrypoints/openai/chat_completion/serving.py | OpenAIServingChat: full chat completion logic, streaming, tool parsing |
| entrypoints/openai/chat_completion/protocol.py | Pydantic models: ChatCompletionRequest, response types |
| entrypoints/openai/completion/api_router.py | /v1/completions route function |
| entrypoints/openai/completion/serving.py | OpenAIServingCompletion: text completion logic |
| entrypoints/openai/completion/protocol.py | Pydantic models: CompletionRequest, response types |
| entrypoints/openai/engine/serving.py | OpenAIServing base class: shared logic, beam search, model check |
| entrypoints/chat_utils.py | Chat message types, multimodal content part types, template loading |
| entrypoints/llm.py | LLM class for offline batch inference |
| renderers/inputs/preprocess.py | Prompt normalization, type parsing, encoder-decoder handling |
| entrypoints/serve/__init__.py | Registers always-on routers: LoRA, profile, sleep, cache, tokenize |
| scripts.py | Deprecated shim redirecting to entrypoints/cli/main.py |