Commit

[Frontend] OpenAI API server: Do not add bos token by default when encoding (#4688)
bofenghuang committed May 17, 2024
1 parent 8e7fb5d commit 0150a10
Showing 2 changed files with 22 additions and 12 deletions.
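
For context on the fix: a model's chat template typically renders the BOS token into the prompt text itself, so encoding that rendered prompt with the tokenizer's default add_special_tokens=True prepends a second BOS token. Below is a minimal sketch of the effect, assuming the Hugging Face transformers library; the model name is only an example and is not part of this commit.

from transformers import AutoTokenizer

# Example model; any tokenizer whose chat template emits BOS shows the same effect.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

messages = [{"role": "user", "content": "Hello!"}]
# Render the conversation to text; the template already contains "<s>" (BOS).
prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Default encoding adds special tokens again, yielding a duplicated BOS.
with_default = tokenizer(prompt).input_ids
# The behaviour this commit switches to for chat completions.
without_special = tokenizer(prompt, add_special_tokens=False).input_ids

print(with_default[:2])      # e.g. [1, 1] -- BOS appears twice
print(without_special[:2])   # e.g. [1, ...] -- BOS appears once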
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_chat.py
@@ -158,7 +158,7 @@ async def create_chat_completion(
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
             prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt)
+                request, prompt=prompt, add_special_tokens=False)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
             decoding_config = await self.engine.get_decoding_config()
32 changes: 21 additions & 11 deletions vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,7 @@
 import json
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
 from typing_extensions import Annotated
@@ -165,24 +165,34 @@ def _maybe_get_lora(
         raise ValueError(f"The model `{request.model}` does not exist.")
 
     def _validate_prompt_and_tokenize(
-            self,
-            request: Union[ChatCompletionRequest, CompletionRequest,
-                           EmbeddingRequest],
-            prompt: Optional[str] = None,
-            prompt_ids: Optional[List[int]] = None,
-            truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
-    ) -> Tuple[List[int], str]:
+        self,
+        request: Union[ChatCompletionRequest, CompletionRequest,
+                       EmbeddingRequest],
+        prompt: Optional[str] = None,
+        prompt_ids: Optional[List[int]] = None,
+        truncate_prompt_tokens: Optional[Annotated[int,
+                                                   Field(ge=1)]] = None,
+        add_special_tokens: bool = True) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
         if (prompt and prompt_ids):
             raise ValueError(
                 "Only one of prompt or prompt_ids should be provided.")
 
         if prompt_ids is None:
-            tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
-                "truncation": True,
-                "max_length": truncate_prompt_tokens,
+            # When using OpenAIServingChat for chat completions, the
+            # special tokens (e.g., BOS) have already been added by the
+            # chat template. Therefore, we do not need to add them again.
+            # Set add_special_tokens to False to avoid adding the BOS tokens
+            # again.
+            tokenizer_kwargs: Dict[str, Any] = {
+                "add_special_tokens": add_special_tokens
             }
+            if truncate_prompt_tokens is not None:
+                tokenizer_kwargs.update({
+                    "truncation": True,
+                    "max_length": truncate_prompt_tokens,
+                })
             input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
         elif truncate_prompt_tokens is not None:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
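
As a standalone illustration (not the vLLM source; build_tokenizer_kwargs is a made-up helper name), the updated branch above amounts to the following: chat completions opt out of special tokens because the chat template already added them, while other requests keep the previous default and may still request truncation.

from typing import Any, Dict, Optional

def build_tokenizer_kwargs(add_special_tokens: bool = True,
                           truncate_prompt_tokens: Optional[int] = None
                           ) -> Dict[str, Any]:
    # Mirrors how the updated code assembles kwargs for self.tokenizer(...).
    kwargs: Dict[str, Any] = {"add_special_tokens": add_special_tokens}
    if truncate_prompt_tokens is not None:
        kwargs.update({"truncation": True,
                       "max_length": truncate_prompt_tokens})
    return kwargs

# Chat completions (OpenAIServingChat): BOS already added by the template.
assert build_tokenizer_kwargs(add_special_tokens=False) == {
    "add_special_tokens": False}
# Other callers keep the old default, optionally with truncation.
assert build_tokenizer_kwargs(truncate_prompt_tokens=512) == {
    "add_special_tokens": True, "truncation": True, "max_length": 512}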
