Token counting and litellm provider customization #1421

Merged (17 commits, May 5, 2024)
Changes from 4 commits
agenthub/monologue_agent/agent.py (12 additions, 2 deletions)
@@ -32,7 +32,7 @@
 if config.get(ConfigType.AGENT_MEMORY_ENABLED):
     from agenthub.monologue_agent.utils.memory import LongTermMemory

-MAX_MONOLOGUE_LENGTH = 20000
+MAX_TOKEN_COUNT_PADDING = 512
Collaborator: Is this roughly a similar number? 40 chars per token? That seems like a lot to me.

Collaborator: Ahh, never mind -- I see how it's being used differently.

 MAX_OUTPUT_LENGTH = 5000

 INITIAL_THOUGHTS = [
@@ -124,7 +124,17 @@ def _add_event(self, event: dict):
         self.monologue.add_event(event)
         if self.memory is not None:
             self.memory.add_event(event)
-        if self.monologue.get_total_length() > MAX_MONOLOGUE_LENGTH:
+
+        # Test monologue token length
+        prompt = prompts.get_request_action_prompt(
+            '',
+            self.monologue.get_thoughts(),
+            [],
+        )
+        messages = [{'content': prompt, 'role': 'user'}]
+        token_count = self.llm.get_token_count(messages)
+
+        if token_count + MAX_TOKEN_COUNT_PADDING > self.llm.max_input_tokens:
             self.monologue.condense(self.llm)

     def _initialize(self, task: str):
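For illustration, here is a minimal standalone sketch of the guard introduced above, calling litellm's token counter directly. The helper name should_condense, the model name, and the 4096 limit are illustrative, not part of this PR:

    import litellm

    MAX_TOKEN_COUNT_PADDING = 512  # headroom reserved on top of the prompt's token count

    def should_condense(messages, model='gpt-3.5-turbo', max_input_tokens=4096):
        # Count the tokens this prompt would consume for the given model.
        token_count = litellm.token_counter(model=model, messages=messages)
        # Condense once the prompt plus padding would no longer fit the context window.
        return token_count + MAX_TOKEN_COUNT_PADDING > max_input_tokens

    messages = [{'role': 'user', 'content': 'very long monologue text ...'}]
    print(should_condense(messages))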
opendevin/config.py (3 additions, 0 deletions)
@@ -21,6 +21,7 @@
 DEFAULT_CONFIG: dict = {
     ConfigType.LLM_API_KEY: None,
     ConfigType.LLM_BASE_URL: None,
+    ConfigType.LLM_CUSTOM_LLM_PROVIDER: 'openai',
     ConfigType.WORKSPACE_BASE: os.getcwd(),
     ConfigType.WORKSPACE_MOUNT_PATH: None,
     ConfigType.WORKSPACE_MOUNT_PATH_IN_SANDBOX: '/workspace',
@@ -36,6 +37,8 @@
     ConfigType.LLM_RETRY_MIN_WAIT: 3,
     ConfigType.LLM_RETRY_MAX_WAIT: 60,
     ConfigType.MAX_ITERATIONS: 100,
+    ConfigType.LLM_MAX_INPUT_TOKENS: None,
+    ConfigType.LLM_MAX_OUTPUT_TOKENS: None,
     ConfigType.AGENT_MEMORY_MAX_THREADS: 2,
     ConfigType.AGENT_MEMORY_ENABLED: False,
     # GPT-4 pricing is $10 per 1M input tokens. Since tokenization happens on LLM side,
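The new keys are read through the same config.get(ConfigType...) pattern already used in opendevin/llm/llm.py. A small sketch of that pattern (the import paths are assumed from the surrounding code and are not shown in this diff):

    from opendevin import config
    from opendevin.schema import ConfigType

    # None means "not configured"; llm.py then falls back to litellm's model info,
    # and finally to the 4096 input / 1024 output defaults discussed below.
    max_input_tokens = config.get(ConfigType.LLM_MAX_INPUT_TOKENS)
    max_output_tokens = config.get(ConfigType.LLM_MAX_OUTPUT_TOKENS)
    # Defaults to 'openai' per DEFAULT_CONFIG above.
    provider = config.get(ConfigType.LLM_CUSTOM_LLM_PROVIDER)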
opendevin/llm/llm.py (46 additions, 1 deletion)
@@ -1,4 +1,5 @@
 from litellm import completion as litellm_completion
+import litellm
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
 from litellm.exceptions import APIConnectionError, RateLimitError, ServiceUnavailableError
 from functools import partial
@@ -16,6 +17,9 @@
 LLM_NUM_RETRIES = config.get(ConfigType.LLM_NUM_RETRIES)
 LLM_RETRY_MIN_WAIT = config.get(ConfigType.LLM_RETRY_MIN_WAIT)
 LLM_RETRY_MAX_WAIT = config.get(ConfigType.LLM_RETRY_MAX_WAIT)
+LLM_MAX_INPUT_TOKENS = config.get(ConfigType.LLM_MAX_INPUT_TOKENS)
+LLM_MAX_OUTPUT_TOKENS = config.get(ConfigType.LLM_MAX_OUTPUT_TOKENS)
+LLM_CUSTOM_LLM_PROVIDER = config.get(ConfigType.LLM_CUSTOM_LLM_PROVIDER)


 class LLM:
@@ -31,6 +35,9 @@ def __init__(self,
                  num_retries=LLM_NUM_RETRIES,
                  retry_min_wait=LLM_RETRY_MIN_WAIT,
                  retry_max_wait=LLM_RETRY_MAX_WAIT,
+                 max_input_tokens=LLM_MAX_INPUT_TOKENS,
+                 max_output_tokens=LLM_MAX_OUTPUT_TOKENS,
+                 custom_llm_provider=LLM_CUSTOM_LLM_PROVIDER
                  ):
         """
         Args:
@@ -41,6 +48,9 @@ def __init__(self,
             num_retries (int, optional): The number of retries for API calls. Defaults to LLM_NUM_RETRIES.
             retry_min_wait (int, optional): The minimum time to wait between retries in seconds. Defaults to LLM_RETRY_MIN_TIME.
             retry_max_wait (int, optional): The maximum time to wait between retries in seconds. Defaults to LLM_RETRY_MAX_TIME.
+            max_input_tokens (int, optional): The maximum number of tokens to send to and receive from LLM per task. Defaults to LLM_MAX_INPUT_TOKENS.
+            max_output_tokens (int, optional): The maximum number of tokens to send to and receive from LLM per task. Defaults to LLM_MAX_OUTPUT_TOKENS.
+            custom_llm_provider (function, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.

         Attributes:
             model_name (str): The name of the language model.
@@ -54,9 +64,32 @@ def __init__(self,
         self.api_key = api_key
         self.base_url = base_url
         self.api_version = api_version
+        self.max_input_tokens = max_input_tokens
+        self.max_output_tokens = max_output_tokens
+        self.custom_llm_provider = custom_llm_provider
+
+        # litellm actually uses base Exception here for unknown model
+        self.model_info = None
+        try:
+            self.model_info = litellm.get_model_info(self.model_name)
+        # noinspection PyBroadException
+        except Exception:
+            logger.warning(f'Could not get model info for {self.model_name}')
+
+        if self.max_input_tokens is None:
+            if self.model_info is not None and 'max_input_tokens' in self.model_info:
+                self.max_input_tokens = self.model_info['max_input_tokens']
+            else:
+                self.max_input_tokens = 4096
+
+        if self.max_output_tokens is None:
+            if self.model_info is not None and 'max_output_tokens' in self.model_info:
+                self.max_output_tokens = self.model_info['max_output_tokens']
+            else:
+                self.max_output_tokens = 1024
Collaborator: Just curious: where does this number come from? I guess 4096 is because it's the limit of GPT-3.5, but how about this one?

Contributor Author: I don't have a significant justification for either of these defaults, and I am interested to hear opinions on them. I regularly experienced overruns with a 512 output token limit, and therefore I usually use 1024 or higher locally.

Collaborator: I don't have a strong opinion either. I just feel like it would be better to have some comments explaining where these numbers are from.

Contributor Author: I have added comments documenting this:

    # Max input tokens for gpt3.5, so this is a safe fallback for any potentially viable model
    self.max_input_tokens = 4096

    # Enough tokens for most output actions, and not too many for a bad llm to get carried away responding
    # with thousands of unwanted tokens
    self.max_output_tokens = 1024


         self._completion = partial(
-            litellm_completion, model=self.model_name, api_key=self.api_key, base_url=self.base_url, api_version=self.api_version)
+            litellm_completion, model=self.model_name, api_key=self.api_key, base_url=self.base_url, api_version=self.api_version, max_tokens=max_output_tokens, custom_llm_provider=custom_llm_provider)

         completion_unwrapped = self._completion

@@ -89,6 +122,18 @@ def completion(self):
         """
         return self._completion

+    def get_token_count(self, messages):
+        """
+        Get the number of tokens in a list of messages.
+
+        Args:
+            messages (list): A list of messages.
+
+        Returns:
+            int: The number of tokens.
+        """
+        return litellm.token_counter(model=self.model_name, messages=messages)
+
     def __str__(self):
         if self.api_version:
             return f'LLM(model={self.model_name}, api_version={self.api_version}, base_url={self.base_url})'
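A minimal usage sketch of the new surface area, based on the constructor parameters and the get_token_count method shown above. It assumes the model name is the first constructor argument; the model name and token limits here are illustrative, not defaults from this PR:

    from opendevin.llm.llm import LLM

    # Explicit overrides; when left as None, the constructor falls back to litellm's
    # model info and then to the 4096 input / 1024 output defaults commented above.
    llm = LLM(
        'gpt-3.5-turbo',
        max_input_tokens=16385,
        max_output_tokens=1024,
        custom_llm_provider='openai',
    )

    messages = [{'role': 'user', 'content': 'Hello there'}]
    print(llm.get_token_count(messages))  # token count litellm computes for this model
    print(llm.max_input_tokens)           # 16385: the explicit override wins over model info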
opendevin/schema/config.py (3 additions, 0 deletions)
@@ -2,6 +2,9 @@


 class ConfigType(str, Enum):
+    LLM_CUSTOM_LLM_PROVIDER = 'LLM_CUSTOM_LLM_PROVIDER'
+    LLM_MAX_INPUT_TOKENS = 'LLM_MAX_INPUT_TOKENS'
+    LLM_MAX_OUTPUT_TOKENS = 'LLM_MAX_OUTPUT_TOKENS'
     LLM_API_KEY = 'LLM_API_KEY'
     LLM_BASE_URL = 'LLM_BASE_URL'
     WORKSPACE_BASE = 'WORKSPACE_BASE'