diff --git a/agenthub/monologue_agent/agent.py b/agenthub/monologue_agent/agent.py
index 817dd638989..a9451b8c0f4 100644
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -32,7 +32,7 @@
 if config.get(ConfigType.AGENT_MEMORY_ENABLED):
     from agenthub.monologue_agent.utils.memory import LongTermMemory
 
-MAX_MONOLOGUE_LENGTH = 20000
+MAX_TOKEN_COUNT_PADDING = 512
 MAX_OUTPUT_LENGTH = 5000
 
 INITIAL_THOUGHTS = [
@@ -128,7 +128,17 @@ def _add_event(self, event: dict):
         self.monologue.add_event(event)
         if self.memory is not None:
             self.memory.add_event(event)
-        if self.monologue.get_total_length() > MAX_MONOLOGUE_LENGTH:
+
+        # Test monologue token length
+        prompt = prompts.get_request_action_prompt(
+            '',
+            self.monologue.get_thoughts(),
+            [],
+        )
+        messages = [{'content': prompt, 'role': 'user'}]
+        token_count = self.llm.get_token_count(messages)
+
+        if token_count + MAX_TOKEN_COUNT_PADDING > self.llm.max_input_tokens:
             self.monologue.condense(self.llm)
 
     def _initialize(self, task: str):
diff --git a/opendevin/core/config.py b/opendevin/core/config.py
index 5a4bb8ea693..6d8c45124e0 100644
--- a/opendevin/core/config.py
+++ b/opendevin/core/config.py
@@ -22,6 +22,7 @@
 DEFAULT_CONFIG: dict = {
     ConfigType.LLM_API_KEY: None,
     ConfigType.LLM_BASE_URL: None,
+    ConfigType.LLM_CUSTOM_LLM_PROVIDER: None,
     ConfigType.WORKSPACE_BASE: os.getcwd(),
     ConfigType.WORKSPACE_MOUNT_PATH: None,
     ConfigType.WORKSPACE_MOUNT_PATH_IN_SANDBOX: '/workspace',
@@ -38,10 +39,11 @@
     ConfigType.LLM_RETRY_MIN_WAIT: 3,
     ConfigType.LLM_RETRY_MAX_WAIT: 60,
     ConfigType.MAX_ITERATIONS: 100,
+    ConfigType.LLM_MAX_INPUT_TOKENS: None,
+    ConfigType.LLM_MAX_OUTPUT_TOKENS: None,
     ConfigType.AGENT_MEMORY_MAX_THREADS: 2,
     ConfigType.AGENT_MEMORY_ENABLED: False,
     ConfigType.LLM_TIMEOUT: None,
-    ConfigType.LLM_MAX_RETURN_TOKENS: None,
     ConfigType.LLM_TEMPERATURE: None,
     ConfigType.LLM_TOP_P: None,
     # GPT-4 pricing is $10 per 1M input tokens. Since tokenization happens on LLM side,
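
The agent.py hunk above swaps the monologue's character-length check for a token-based one: it builds the prompt that would actually be sent, counts its tokens through the LLM wrapper, and condenses once the count plus a 512-token padding crosses the model's input limit. A minimal standalone sketch of that trigger, where should_condense and build_prompt are hypothetical names standing in for the agent's inlined logic and prompts.get_request_action_prompt:

MAX_TOKEN_COUNT_PADDING = 512  # safety margin kept below the model's input limit


def should_condense(llm, thoughts, build_prompt) -> bool:
    # Build the same prompt the agent would send for its next action request.
    prompt = build_prompt('', thoughts, [])
    messages = [{'content': prompt, 'role': 'user'}]
    # Count tokens via the LLM wrapper (litellm.token_counter under the hood)
    # and condense before the padded count exceeds max_input_tokens.
    return llm.get_token_count(messages) + MAX_TOKEN_COUNT_PADDING > llm.max_input_tokens
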
diff --git a/opendevin/core/schema/config.py b/opendevin/core/schema/config.py
index ccde824df11..b99d98b938a 100644
--- a/opendevin/core/schema/config.py
+++ b/opendevin/core/schema/config.py
@@ -2,9 +2,11 @@
 
 
 class ConfigType(str, Enum):
+    LLM_CUSTOM_LLM_PROVIDER = 'LLM_CUSTOM_LLM_PROVIDER'
+    LLM_MAX_INPUT_TOKENS = 'LLM_MAX_INPUT_TOKENS'
+    LLM_MAX_OUTPUT_TOKENS = 'LLM_MAX_OUTPUT_TOKENS'
     LLM_TOP_P = 'LLM_TOP_P'
     LLM_TEMPERATURE = 'LLM_TEMPERATURE'
-    LLM_MAX_RETURN_TOKENS = 'LLM_MAX_RETURN_TOKENS'
     LLM_TIMEOUT = 'LLM_TIMEOUT'
     LLM_API_KEY = 'LLM_API_KEY'
     LLM_BASE_URL = 'LLM_BASE_URL'
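
The llm.py changes below resolve the two limits from litellm's model metadata whenever the config leaves LLM_MAX_INPUT_TOKENS or LLM_MAX_OUTPUT_TOKENS unset. A rough standalone sketch of that fallback, assuming only that litellm is installed; resolve_token_limits is a hypothetical helper and the 4096/1024 defaults mirror the diff:

import litellm


def resolve_token_limits(model_name, max_input_tokens=None, max_output_tokens=None):
    try:
        # litellm raises (a base Exception) for models it has no metadata for.
        model_info = litellm.get_model_info(model_name)
    except Exception:
        model_info = None

    if max_input_tokens is None:
        # Fall back to a gpt-3.5 sized input window when nothing better is known.
        max_input_tokens = (model_info or {}).get('max_input_tokens') or 4096
    if max_output_tokens is None:
        # Keep replies bounded even when no metadata is available.
        max_output_tokens = (model_info or {}).get('max_output_tokens') or 1024
    return max_input_tokens, max_output_tokens
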
diff --git a/opendevin/llm/llm.py b/opendevin/llm/llm.py
index f9ce38dc19c..e3f9fe39ffa 100644
--- a/opendevin/llm/llm.py
+++ b/opendevin/llm/llm.py
@@ -1,5 +1,6 @@
 from functools import partial
 
+import litellm
 from litellm import completion as litellm_completion
 from litellm.exceptions import (
     APIConnectionError,
@@ -25,8 +26,10 @@
 LLM_NUM_RETRIES = config.get(ConfigType.LLM_NUM_RETRIES)
 LLM_RETRY_MIN_WAIT = config.get(ConfigType.LLM_RETRY_MIN_WAIT)
 LLM_RETRY_MAX_WAIT = config.get(ConfigType.LLM_RETRY_MAX_WAIT)
+LLM_MAX_INPUT_TOKENS = config.get(ConfigType.LLM_MAX_INPUT_TOKENS)
+LLM_MAX_OUTPUT_TOKENS = config.get(ConfigType.LLM_MAX_OUTPUT_TOKENS)
+LLM_CUSTOM_LLM_PROVIDER = config.get(ConfigType.LLM_CUSTOM_LLM_PROVIDER)
 LLM_TIMEOUT = config.get(ConfigType.LLM_TIMEOUT)
-LLM_MAX_RETURN_TOKENS = config.get(ConfigType.LLM_MAX_RETURN_TOKENS)
 LLM_TEMPERATURE = config.get(ConfigType.LLM_TEMPERATURE)
 LLM_TOP_P = config.get(ConfigType.LLM_TOP_P)
 
@@ -45,8 +48,10 @@ def __init__(
         num_retries=LLM_NUM_RETRIES,
         retry_min_wait=LLM_RETRY_MIN_WAIT,
         retry_max_wait=LLM_RETRY_MAX_WAIT,
+        max_input_tokens=LLM_MAX_INPUT_TOKENS,
+        max_output_tokens=LLM_MAX_OUTPUT_TOKENS,
+        custom_llm_provider=LLM_CUSTOM_LLM_PROVIDER,
         llm_timeout=LLM_TIMEOUT,
-        llm_max_return_tokens=LLM_MAX_RETURN_TOKENS,
         llm_temperature=LLM_TEMPERATURE,
         llm_top_p=LLM_TOP_P,
     ):
@@ -59,8 +64,10 @@ def __init__(
             num_retries (int, optional): The number of retries for API calls. Defaults to LLM_NUM_RETRIES.
             retry_min_wait (int, optional): The minimum time to wait between retries in seconds. Defaults to LLM_RETRY_MIN_TIME.
             retry_max_wait (int, optional): The maximum time to wait between retries in seconds. Defaults to LLM_RETRY_MAX_TIME.
+            max_input_tokens (int, optional): The maximum number of tokens to send to the LLM per task. Defaults to LLM_MAX_INPUT_TOKENS.
+            max_output_tokens (int, optional): The maximum number of tokens to receive from the LLM per task. Defaults to LLM_MAX_OUTPUT_TOKENS.
+            custom_llm_provider (str, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.
             llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
-            llm_max_return_tokens (int, optional): The maximum number of tokens to return. Defaults to LLM_MAX_RETURN_TOKENS.
 
         Attributes:
             model_name (str): The name of the language model.
@@ -73,8 +80,33 @@ def __init__(
         self.api_key = api_key
         self.base_url = base_url
         self.api_version = api_version
+        self.max_input_tokens = max_input_tokens
+        self.max_output_tokens = max_output_tokens
         self.llm_timeout = llm_timeout
-        self.llm_max_return_tokens = llm_max_return_tokens
+        self.custom_llm_provider = custom_llm_provider
+
+        # litellm actually uses base Exception here for unknown model
+        self.model_info = None
+        try:
+            self.model_info = litellm.get_model_info(self.model_name)
+        # noinspection PyBroadException
+        except Exception:
+            logger.warning(f'Could not get model info for {self.model_name}')
+
+        if self.max_input_tokens is None:
+            if self.model_info is not None and 'max_input_tokens' in self.model_info:
+                self.max_input_tokens = self.model_info['max_input_tokens']
+            else:
+                # Max input tokens for gpt3.5, so this is a safe fallback for any potentially viable model
+                self.max_input_tokens = 4096
+
+        if self.max_output_tokens is None:
+            if self.model_info is not None and 'max_output_tokens' in self.model_info:
+                self.max_output_tokens = self.model_info['max_output_tokens']
+            else:
+                # Enough tokens for most output actions, and not too many for a bad llm to get carried away responding
+                # with thousands of unwanted tokens
+                self.max_output_tokens = 1024
 
         self._completion = partial(
             litellm_completion,
@@ -82,7 +114,8 @@ def __init__(
             api_key=self.api_key,
             base_url=self.base_url,
             api_version=self.api_version,
-            max_tokens=self.llm_max_return_tokens,
+            custom_llm_provider=custom_llm_provider,
+            max_tokens=self.max_output_tokens,
             timeout=self.llm_timeout,
             temperature=llm_temperature,
             top_p=llm_top_p,
@@ -129,6 +162,18 @@ def completion(self):
         """
         return self._completion
 
+    def get_token_count(self, messages):
+        """
+        Get the number of tokens in a list of messages.
+
+        Args:
+            messages (list): A list of messages.
+
+        Returns:
+            int: The number of tokens.
+        """
+        return litellm.token_counter(model=self.model_name, messages=messages)
+
     def __str__(self):
         if self.api_version:
             return f'LLM(model={self.model_name}, api_version={self.api_version}, base_url={self.base_url})'
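
For completeness, a usage sketch of the new attributes and helper; it assumes the constructor still takes the model name as its model argument, and gpt-3.5-turbo is purely an example:

from opendevin.llm.llm import LLM

llm = LLM(model='gpt-3.5-turbo')

# Limits come from config, litellm's model info, or the 4096/1024 fallbacks.
print(llm.max_input_tokens, llm.max_output_tokens)

# get_token_count delegates to litellm.token_counter for the configured model.
messages = [{'role': 'user', 'content': 'List the files you would inspect first.'}]
print(llm.get_token_count(messages))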