Skip to content

vision_agent.agent

vision_agent.agent.agent.Agent

Bases: ABC

log_progress abstractmethod

log_progress(data)

Log the progress of the agent. This is a hook that is intended for reporting the progress of the agent.

Source code in vision_agent/agent/agent.py
@abstractmethod
def log_progress(self, data: Dict[str, Any]) -> None:
    """Report the progress of the agent.

    This is a reporting hook: it is declared abstract, so concrete agents
    must provide their own implementation. The base body does nothing.

    Parameters:
        data (Dict[str, Any]): The progress payload to report.
    """
    return None

vision_agent.agent.vision_agent.VisionAgent

VisionAgent(
    agent=None,
    cwd=None,
    verbosity=0,
    callback_message=None,
    code_sandbox_runtime=None,
)

Bases: Agent

Vision Agent is an agent that can chat with the user and call tools or other agents to generate code for it. Vision Agent uses Python code to execute actions for the user. Vision Agent is inspired by OpenDevin https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

Example

>>> from vision_agent.agent import VisionAgent
>>> agent = VisionAgent()
>>> resp = agent("Hello")
>>> resp.append({"role": "user", "content": "Can you write a function that counts dogs?", "media": ["dog.jpg"]})
>>> resp = agent(resp)

Initialize the VisionAgent.

PARAMETER DESCRIPTION
agent

The agent to use for conversation and orchestration of other agents.

TYPE: Optional[LMM] DEFAULT: None

verbosity

The verbosity level of the agent.

TYPE: int DEFAULT: 0

callback_message

Callback function to send intermediate update messages.

TYPE: Optional[Callable[[Dict[str, Any]], None]] DEFAULT: None

code_sandbox_runtime

For string values it can be one of: None, "local" or "e2b". If None, it will read from the environment variable "CODE_SANDBOX_RUNTIME".

TYPE: Optional[str] DEFAULT: None

Source code in vision_agent/agent/vision_agent.py
def __init__(
    self,
    agent: Optional[LMM] = None,
    cwd: Optional[Union[Path, str]] = None,
    verbosity: int = 0,
    callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
    code_sandbox_runtime: Optional[str] = None,
) -> None:
    """Set up a VisionAgent instance.

    Parameters:
        agent (Optional[LMM]): The agent to use for conversation and
            orchestration of other agents. When None, an
            AnthropicLMM(temperature=0.0) is created.
        cwd (Optional[Union[Path, str]]): Working directory, stored as a Path.
            When None, the current working directory (Path.cwd()) is used.
        verbosity (int): The verbosity level of the agent. A value of 1 or
            more switches the module logger to INFO.
        callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
            function to send intermediate update messages.
        code_sandbox_runtime (Optional[str]): For string values it can be one
            of: None, "local" or "e2b". If None, it will read from the
            environment variable "CODE_SANDBOX_RUNTIME".
    """

    # Fall back to the default LMM only when the caller did not supply one.
    if agent is None:
        agent = AnthropicLMM(temperature=0.0)
    self.agent = agent
    self.max_iterations = 12
    self.cwd = Path.cwd() if cwd is None else Path(cwd)
    self.verbosity = verbosity
    self.callback_message = callback_message
    self.code_sandbox_runtime = code_sandbox_runtime
    if self.verbosity >= 1:
        _LOGGER.setLevel(logging.INFO)

agent instance-attribute

agent = (
    AnthropicLMM(temperature=0.0)
    if agent is None
    else agent
)

max_iterations instance-attribute

max_iterations = 12

cwd instance-attribute

cwd = Path(cwd) if cwd is not None else Path.cwd()

verbosity instance-attribute

verbosity = verbosity

callback_message instance-attribute

callback_message = callback_message

code_sandbox_runtime instance-attribute

code_sandbox_runtime = code_sandbox_runtime

chat

chat(chat)

Chat with VisionAgent, it will use code to execute actions to accomplish its tasks.

PARAMETER DESCRIPTION
chat

A conversation in the format of: [{"role": "user", "content": "describe your task here..."}] or if it contains media files, it should be in the format of: [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

TYPE: List[Message]

RETURNS DESCRIPTION
List[Message]

List[Message]: The conversation response.

Source code in vision_agent/agent/vision_agent.py
def chat(
    self,
    chat: List[Message],
) -> List[Message]:
    """Run a conversation through VisionAgent and return only the messages.

    This delegates to chat_with_artifacts and keeps the first element of its
    result (the conversation), discarding the rest.

    Parameters:
        chat (List[Message]): A conversation in the format of:
            [{"role": "user", "content": "describe your task here..."}]
            or if it contains media files, it should be in the format of:
            [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

    Returns:
        List[Message]: The conversation response.
    """
    outcome = self.chat_with_artifacts(chat)
    conversation = outcome[0]
    return conversation

chat_with_artifacts

chat_with_artifacts(
    chat,
    artifacts=None,
    test_multi_plan=True,
    custom_tool_names=None,
)

Chat with VisionAgent, it will use code to execute actions to accomplish its tasks.

PARAMETER DESCRIPTION
chat

A conversation in the format of: [{"role": "user", "content": "describe your task here..."}] or if it contains media files, it should be in the format of: [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

TYPE: List[Message]

artifacts

The artifacts to use in the task.

TYPE: Optional[Artifacts] DEFAULT: None

test_multi_plan

If True, it will test tools for multiple plans and pick the best one based off of the tool results. If False, it will go with the first plan.

TYPE: bool DEFAULT: True

custom_tool_names

A list of customized tools for agent to pick and use. If not provided, default to full tool set from vision_agent.tools.

TYPE: Optional[List[str]] DEFAULT: None

RETURNS DESCRIPTION
Tuple[List[Message], Artifacts]

Tuple[List[Message], Artifacts]: The conversation response together with the artifacts used during the task.

Source code in vision_agent/agent/vision_agent.py
def chat_with_artifacts(
    self,
    chat: List[Message],
    artifacts: Optional[Artifacts] = None,
    test_multi_plan: bool = True,
    custom_tool_names: Optional[List[str]] = None,
) -> Tuple[List[Message], Artifacts]:
    """Chat with VisionAgent, it will use code to execute actions to accomplish
    its tasks.

    Two copies of the conversation are maintained: ``int_chat`` is what is
    passed to the conversation agent, ``orig_chat`` is what is returned to the
    caller (it additionally carries the "execution" results of code actions).

    Parameters:
        chat (List[Message]): A conversation in the format of:
            [{"role": "user", "content": "describe your task here..."}]
            or if it contains media files, it should be in the format of:
            [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
        artifacts (Optional[Artifacts]): The artifacts to use in the task. When
            falsy, a new Artifacts(self.cwd) is created.
        test_multi_plan (bool): If True, it will test tools for multiple plans and
            pick the best one based off of the tool results. If False, it will go
            with the first plan.
        custom_tool_names (Optional[List[str]]): A list of customized tools for
            agent to pick and use. If not provided, default to full tool set from
            vision_agent.tools.

    Returns:
        Tuple[List[Message], Artifacts]: The conversation response and the
            artifacts used during the task.

    Raises:
        ValueError: If ``chat`` is empty.
    """

    if not chat:
        raise ValueError("chat cannot be empty")

    if not artifacts:
        artifacts = Artifacts(self.cwd)

    with CodeInterpreterFactory.new_instance(
        code_sandbox_runtime=self.code_sandbox_runtime,
        remote_path=self.cwd,
    ) as code_interpreter:
        orig_chat = copy.deepcopy(chat)
        int_chat = copy.deepcopy(chat)
        last_user_message = chat[-1]
        # Tell the agent where each media file lives relative to the artifacts
        # working directory by appending it to the message text.
        for chat_i in int_chat:
            if "media" in chat_i:
                for media in chat_i["media"]:
                    media = cast(str, media)
                    media_remote_path = Path(artifacts.cwd) / Path(media).name
                    chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore

        # Rebuild the internal chat with converted content, keeping only the
        # role/content (and media, when present) keys.
        int_chat = cast(
            List[Message],
            [
                (
                    {
                        "role": c["role"],
                        "content": old_format_to_new_format(c["content"]),  # type: ignore
                        "media": c["media"],
                    }
                    if "media" in c
                    else {"role": c["role"], "content": old_format_to_new_format(c["content"])}  # type: ignore
                )
                for c in int_chat
            ],
        )

        finished = False
        iterations = 0
        last_response = None

        # Upload artifacts to remote location and show where they are going
        # to be loaded to. The actual loading happens in BoilerplateCode as
        # part of the pre_code.
        artifacts_loaded = artifacts.show()
        int_chat.append({"role": "observation", "content": artifacts_loaded})
        orig_chat.append({"role": "observation", "content": artifacts_loaded})
        self.streaming_message({"role": "observation", "content": artifacts_loaded})

        user_result, user_obs = execute_user_code_action(
            artifacts,
            last_user_message,
            code_interpreter,
        )
        # If the user's own message produced an execution result, the
        # conversation loop below is skipped entirely.
        finished = user_result is not None and user_obs is not None
        if user_result is not None and user_obs is not None:
            # be sure to update the chat with user execution results
            # NOTE(review): chat_elt is the same dict object in int_chat and
            # orig_chat, so the "execution" key set below is visible in both.
            # Confirm that is intended for the user-code path.
            chat_elt: Message = {"role": "observation", "content": user_obs}
            int_chat.append(chat_elt)
            chat_elt["execution"] = user_result
            orig_chat.append(chat_elt)
            self.streaming_message(
                {
                    "role": "observation",
                    "content": user_obs,
                    "execution": user_result,
                    "finished": finished,
                }
            )

        while not finished and iterations < self.max_iterations:
            response = run_conversation(self.agent, int_chat)
            if self.verbosity >= 1:
                _LOGGER.info(response)

            code_action = response.get("execute_python", None)
            # sometimes it gets stuck in a loop, so we force it to exit
            if last_response == response:
                response["let_user_respond"] = True
                self.streaming_message(
                    {
                        "role": "assistant",
                        "content": "{}",
                        "error": {
                            "name": "Error when running conversation agent",
                            "value": "Agent is stuck in conversation loop, exited",
                            "traceback_raw": [],
                        },
                        "finished": True,
                    }
                )
            else:
                self.streaming_message(
                    {
                        "role": "assistant",
                        "content": new_format_to_old_format(
                            add_step_descriptions(response)
                        ),
                        "finished": response.get("let_user_respond", False)
                        and code_action is None,
                    }
                )

            int_chat.append(
                {
                    "role": "assistant",
                    "content": json.dumps(
                        new_format_to_old_format(add_step_descriptions(response))
                    ),
                }
            )
            orig_chat.append(
                {
                    "role": "assistant",
                    "content": json.dumps(
                        new_format_to_old_format(add_step_descriptions(response))
                    ),
                }
            )
            finished = response.get("let_user_respond", False)

            if code_action is not None:
                code_action = use_extra_vision_agent_args(
                    code_action, test_multi_plan, custom_tool_names
                )

            if code_action is not None:
                result, obs = execute_code_action(
                    artifacts,
                    code_action,
                    code_interpreter,
                )
                obs_chat_elt: Message = {"role": "observation", "content": obs}
                media_obs = check_and_load_image(code_action)
                if media_obs and result.success:
                    obs_chat_elt["media"] = [
                        artifacts.cwd / media_ob for media_ob in media_obs
                    ]

                if self.verbosity >= 1:
                    _LOGGER.info(obs)

                # don't add execution results to internal chat: append a
                # shallow copy so that setting "execution" on the element
                # below does not also show up in int_chat (previously the
                # same dict object was shared by both lists).
                int_chat.append(copy.copy(obs_chat_elt))
                obs_chat_elt["execution"] = result
                orig_chat.append(obs_chat_elt)
                self.streaming_message(
                    {
                        "role": "observation",
                        "content": obs,
                        "execution": result,
                        "finished": finished,
                    }
                )

            iterations += 1
            last_response = response

    return orig_chat, artifacts

streaming_message

streaming_message(message)
Source code in vision_agent/agent/vision_agent.py
def streaming_message(self, message: Dict[str, Any]) -> None:
    """Forward an intermediate update to the configured callback, if any.

    Parameters:
        message (Dict[str, Any]): The update message to pass to
            ``self.callback_message``. Nothing happens when that callback
            is falsy (e.g. None).
    """
    callback = self.callback_message
    if not callback:
        return
    callback(message)

log_progress

log_progress(data)
Source code in vision_agent/agent/vision_agent.py
def log_progress(self, data: Dict[str, Any]) -> None:
    """Progress hook; this implementation intentionally does nothing.

    Parameters:
        data (Dict[str, Any]): The progress payload (ignored).
    """
    return None

vision_agent.agent.vision_agent_coder.VisionAgentCoder

VisionAgentCoder(
    planner=None,
    coder=None,
    tester=None,
    debugger=None,
    verbosity=0,
    report_progress_callback=None,
    code_interpreter=None,
)

Bases: Agent

Vision Agent Coder is an agentic framework that can output code based on a user request. It can plan tasks, retrieve relevant tools, write code, write tests and reflect on failed test cases to debug code. It is inspired by AgentCoder https://arxiv.org/abs/2312.13010 and Data Interpreter https://arxiv.org/abs/2402.18679

Example

>>> import vision_agent as va
>>> agent = va.agent.VisionAgentCoder()
>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")

Initialize the Vision Agent Coder.

PARAMETER DESCRIPTION
planner

The planner model to use. Defaults to AnthropicVisionAgentPlanner.

TYPE: Optional[Agent] DEFAULT: None

coder

The coder model to use. Defaults to AnthropicLMM.

TYPE: Optional[LMM] DEFAULT: None

tester

The tester model to use. Defaults to AnthropicLMM.

TYPE: Optional[LMM] DEFAULT: None

debugger

The debugger model to use. Defaults to AnthropicLMM.

TYPE: Optional[LMM] DEFAULT: None

verbosity

The verbosity level of the agent. Defaults to 0. 2 is the highest verbosity level which will output all intermediate debugging code.

TYPE: int DEFAULT: 0

report_progress_callback

A callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgentCoder instances are running in parallel. This callback ensures that the progress updates are not mixed up.

TYPE: Optional[Callable[[Dict[str, Any]], None]] DEFAULT: None

code_interpreter

For string values it can be one of: None, "local" or "e2b". If None, it will read from the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter object is provided it will use that.

TYPE: Optional[Union[str, CodeInterpreter]] DEFAULT: None

Source code in vision_agent/agent/vision_agent_coder.py
def __init__(
    self,
    planner: Optional[Agent] = None,
    coder: Optional[LMM] = None,
    tester: Optional[LMM] = None,
    debugger: Optional[LMM] = None,
    verbosity: int = 0,
    report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
) -> None:
    """Set up a Vision Agent Coder.

    Parameters:
        planner (Optional[Agent]): The planner model to use. Defaults to
            AnthropicVisionAgentPlanner (created with the given verbosity).
        coder (Optional[LMM]): The coder model to use. Defaults to
            AnthropicLMM(temperature=0.0).
        tester (Optional[LMM]): The tester model to use. Defaults to
            AnthropicLMM(temperature=0.0).
        debugger (Optional[LMM]): The debugger model to use. Defaults to
            AnthropicLMM(temperature=0.0).
        verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is
            the highest verbosity level which will output all intermediate
            debugging code. Any value above 0 switches the module logger to
            INFO.
        report_progress_callback (Optional[Callable[[Dict[str, Any]], None]]):
            A callback to report the progress of the agent. This is useful for
            streaming logs in a web application where multiple
            VisionAgentCoder instances are running in parallel. This callback
            ensures that the progress updates are not mixed up.
        code_interpreter (Optional[Union[str, CodeInterpreter]]): For string
            values it can be one of: None, "local" or "e2b". If None, it will
            read from the environment variable "CODE_SANDBOX_RUNTIME". If a
            CodeInterpreter object is provided it will use that.
    """

    # Resolve each collaborator, building the default only when it is missing.
    if planner is None:
        planner = AnthropicVisionAgentPlanner(verbosity=verbosity)
    self.planner = planner
    self.coder = coder if coder is not None else AnthropicLMM(temperature=0.0)
    self.tester = tester if tester is not None else AnthropicLMM(temperature=0.0)
    self.debugger = (
        debugger if debugger is not None else AnthropicLMM(temperature=0.0)
    )

    self.verbosity = verbosity
    if self.verbosity > 0:
        _LOGGER.setLevel(logging.INFO)

    self.report_progress_callback = report_progress_callback
    self.code_interpreter = code_interpreter

planner instance-attribute

planner = (
    AnthropicVisionAgentPlanner(verbosity=verbosity)
    if planner is None
    else planner
)

coder instance-attribute

coder = (
    AnthropicLMM(temperature=0.0)
    if coder is None
    else coder
)

tester instance-attribute

tester = (
    AnthropicLMM(temperature=0.0)
    if tester is None
    else tester
)

debugger instance-attribute

debugger = (
    AnthropicLMM(temperature=0.0)
    if debugger is None
    else debugger
)

verbosity instance-attribute

verbosity = verbosity

report_progress_callback instance-attribute

report_progress_callback = report_progress_callback

code_interpreter instance-attribute

code_interpreter = code_interpreter

generate_code_from_plan

generate_code_from_plan(
    chat, plan_context, code_interpreter=None
)

Generates code and other intermediate outputs from a chat input and a plan. The plan includes: - plans: The plans generated by the planner. - best_plan: The best plan selected by the planner. - plan_thoughts: The thoughts of the planner, including any modifications to the plan. - tool_doc: The tool documentation for the best plan. - tool_output: The tool output from the tools used by the best plan.

PARAMETER DESCRIPTION
chat

A conversation in the format of [{"role": "user", "content": "describe your task here..."}].

TYPE: List[Message]

plan_context

The context of the plan, including the plans, best_plan, plan_thoughts, tool_doc, and tool_output.

TYPE: PlanContext

RETURNS DESCRIPTION
Dict[str, Any]

Dict[str, Any]: A dictionary containing the code output by the VisionAgentCoder and other intermediate outputs. include: - status (str): Whether or not the agent completed or failed generating the code. - code (str): The code output by the VisionAgentCoder. - test (str): The test output by the VisionAgentCoder. - test_result (Execution): The result of the test execution. - plans (Dict[str, Any]): The plans generated by the planner. - plan_thoughts (str): The thoughts of the planner. - working_memory (List[Dict[str, str]]): The working memory of the agent.

Source code in vision_agent/agent/vision_agent_coder.py
def generate_code_from_plan(
    self,
    chat: List[Message],
    plan_context: PlanContext,
    code_interpreter: Optional[CodeInterpreter] = None,
) -> Dict[str, Any]:
    """Generates code and other intermediate outputs from a chat input and a plan.
    The plan includes:
        - plans: The plans generated by the planner.
        - best_plan: The best plan selected by the planner.
        - plan_thoughts: The thoughts of the planner, including any modifications
            to the plan.
        - tool_doc: The tool documentation for the best plan.
        - tool_output: The tool output from the tools used by the best plan.

    Parameters:
        chat (List[Message]): A conversation in the format of
            [{"role": "user", "content": "describe your task here..."}].
        plan_context (PlanContext): The context of the plan, including the plans,
            best_plan, plan_thoughts, tool_doc, and tool_output.
        code_interpreter (Optional[CodeInterpreter]): An interpreter to run the
            generated code in. When provided it is used as-is and is treated
            as borrowed: this method does not enter or exit its context, the
            caller manages its lifecycle. When None, ``self.code_interpreter``
            is used if it is an interpreter object, otherwise a new instance
            is created from the factory; that interpreter is used as a context
            manager for the duration of the call.

    Returns:
        Dict[str, Any]: A dictionary containing the code output by the
            VisionAgentCoder and other intermediate outputs. include:
            - status (str): Whether or not the agent completed or failed generating
                the code.
            - code (str): The code output by the VisionAgentCoder.
            - test (str): The test output by the VisionAgentCoder.
            - test_result (Execution): The result of the test execution.
            - plans (Dict[str, Any]): The plans generated by the planner.
            - plan_thoughts (str): The thoughts of the planner.
            - working_memory (List[Dict[str, str]]): The working memory of the agent.

    Raises:
        ValueError: If ``chat`` is empty.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from contextlib import nullcontext

    if not chat:
        raise ValueError("Chat cannot be empty.")

    interpreter_scope: Any
    if code_interpreter is not None:
        # Previously this argument was silently overwritten, so callers such
        # as generate_code ended up with a second interpreter. Honour it, and
        # leave entering/exiting its context to the caller that owns it.
        interpreter_scope = nullcontext(code_interpreter)
    else:
        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
        code_interpreter = (
            self.code_interpreter
            if self.code_interpreter is not None
            and not isinstance(self.code_interpreter, str)
            else CodeInterpreterFactory.new_instance(
                code_sandbox_runtime=self.code_interpreter,
            )
        )
        interpreter_scope = code_interpreter

    with interpreter_scope:
        # Work on a copy so the caller's messages are not modified when the
        # media names are appended to the content below.
        chat = copy.deepcopy(chat)
        media_list = []
        for chat_i in chat:
            if "media" in chat_i:
                for media in chat_i["media"]:
                    chat_i["content"] += f" Media name {media}"  # type: ignore
                    media_list.append(str(media))

        # Keep only the role/content (and media, when present) keys.
        int_chat = cast(
            List[Message],
            [
                (
                    {
                        "role": c["role"],
                        "content": c["content"],
                        "media": c["media"],
                    }
                    if "media" in c
                    else {"role": c["role"], "content": c["content"]}
                )
                for c in chat
            ],
        )

        working_memory: List[Dict[str, str]] = []
        plan = plan_context.plans[plan_context.best_plan]
        tool_doc = plan_context.tool_doc
        tool_output_str = plan_context.tool_output
        plan_thoughts_str = str(plan_context.plan_thoughts)

        if self.verbosity >= 1:
            plan_fixed = [{"instructions": e} for e in plan["instructions"]]
            _LOGGER.info(
                f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
            )

        results = write_and_test_code(
            chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
            plan=f"\n{plan['thoughts']}\n-" + "\n-".join(plan["instructions"]),
            tool_info=tool_doc,
            tool_output=tool_output_str,
            plan_thoughts=plan_thoughts_str,
            tool_utils=T.UTILITIES_DOCSTRING,
            working_memory=working_memory,
            coder=self.coder,
            tester=self.tester,
            debugger=self.debugger,
            code_interpreter=code_interpreter,
            log_progress=self.log_progress,
            verbosity=self.verbosity,
            media=media_list,
        )
        success = cast(bool, results["success"])
        code = remove_installs_from_code(cast(str, results["code"]))
        test = remove_installs_from_code(cast(str, results["test"]))
        working_memory.extend(results["working_memory"])
        execution_result = cast(Execution, results["test_result"])

        return {
            "status": "completed" if success else "failed",
            "code": DefaultImports.prepend_imports(code),
            "test": test,
            "test_result": execution_result,
            "plans": plan_context.plans,
            "plan_thoughts": plan_thoughts_str,
            "working_memory": working_memory,
        }

generate_code

generate_code(
    chat, test_multi_plan=True, custom_tool_names=None
)

Generates code and other intermediate outputs from a chat input.

PARAMETER DESCRIPTION
chat

A conversation in the format of [{"role": "user", "content": "describe your task here..."}].

TYPE: List[Message]

test_multi_plan

Whether to test multiple plans or just the best plan.

TYPE: bool DEFAULT: True

custom_tool_names

A list of custom tool names to use for the planner.

TYPE: Optional[List[str]] DEFAULT: None

RETURNS DESCRIPTION
Dict[str, Any]

Dict[str, Any]: A dictionary containing the code output by the VisionAgentCoder and other intermediate outputs. include: - status (str): Whether or not the agent completed or failed generating the code. - code (str): The code output by the VisionAgentCoder. - test (str): The test output by the VisionAgentCoder. - test_result (Execution): The result of the test execution. - plans (Dict[str, Any]): The plans generated by the planner. - plan_thoughts (str): The thoughts of the planner. - working_memory (List[Dict[str, str]]): The working memory of the agent.

Source code in vision_agent/agent/vision_agent_coder.py
def generate_code(
    self,
    chat: List[Message],
    test_multi_plan: bool = True,
    custom_tool_names: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Plan the task with the planner, then generate code for that plan.

    Parameters:
        chat (List[Message]): A conversation in the format of
            [{"role": "user", "content": "describe your task here..."}].
        test_multi_plan (bool): Whether to test multiple plans or just the best plan.
        custom_tool_names (Optional[List[str]]): A list of custom tool names to use
            for the planner.

    Returns:
        Dict[str, Any]: A dictionary containing the code output by the
            VisionAgentCoder and other intermediate outputs. include:
            - status (str): Whether or not the agent completed or failed generating
                the code.
            - code (str): The code output by the VisionAgentCoder.
            - test (str): The test output by the VisionAgentCoder.
            - test_result (Execution): The result of the test execution.
            - plans (Dict[str, Any]): The plans generated by the planner.
            - plan_thoughts (str): The thoughts of the planner.
            - working_memory (List[Dict[str, str]]): The working memory of the agent.

    Raises:
        ValueError: If ``chat`` is empty.
    """
    if not chat:
        raise ValueError("Chat cannot be empty.")

    # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
    if self.code_interpreter is None or isinstance(self.code_interpreter, str):
        # None or a runtime name: ask the factory for a fresh interpreter.
        code_interpreter = CodeInterpreterFactory.new_instance(
            code_sandbox_runtime=self.code_interpreter,
        )
    else:
        # An interpreter object was configured on the instance: use it.
        code_interpreter = self.code_interpreter

    with code_interpreter:
        planner_kwargs = {
            "test_multi_plan": test_multi_plan,
            "custom_tool_names": custom_tool_names,
            "code_interpreter": code_interpreter,
        }
        plan_context = self.planner.generate_plan(chat, **planner_kwargs)  # type: ignore

        outcome = self.generate_code_from_plan(
            chat,
            plan_context,
            code_interpreter=code_interpreter,
        )
    return outcome

chat

chat(chat)
Source code in vision_agent/agent/vision_agent_coder.py
def chat(self, chat: List[Message]) -> List[Message]:
    """Generate code for the conversation and append it as an agent message.

    The input list is deep-copied first, so the caller's list is not modified.

    Parameters:
        chat (List[Message]): The conversation to generate code for.

    Returns:
        List[Message]: A copy of the conversation with one extra message of
            role "agent" whose content is the generated code.
    """
    conversation = copy.deepcopy(chat)
    generated = self.generate_code(conversation)
    agent_reply = {"role": "agent", "content": generated["code"]}
    conversation.append(agent_reply)
    return conversation

log_progress

log_progress(data)
Source code in vision_agent/agent/vision_agent_coder.py
def log_progress(self, data: Dict[str, Any]) -> None:
    """Pass progress data to the report callback, when one is configured.

    Parameters:
        data (Dict[str, Any]): The progress payload handed to
            ``self.report_progress_callback``. Nothing happens when that
            callback is None.
    """
    callback = self.report_progress_callback
    if callback is None:
        return
    callback(data)

vision_agent.agent.vision_agent_coder.AzureVisionAgentCoder

AzureVisionAgentCoder(
    planner=None,
    coder=None,
    tester=None,
    debugger=None,
    verbosity=0,
    report_progress_callback=None,
    code_interpreter=None,
)

Bases: VisionAgentCoder

VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing.

Pre-requisites: 1. Set the environment variable AZURE_OPENAI_API_KEY to your Azure OpenAI API key. 2. Set the environment variable AZURE_OPENAI_ENDPOINT to your Azure OpenAI endpoint.

Example

>>> import vision_agent as va
>>> agent = va.agent.AzureVisionAgentCoder()
>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")

Initialize the Vision Agent Coder.

PARAMETER DESCRIPTION
planner

The planner model to use. Defaults to AzureVisionAgentPlanner.

TYPE: Optional[Agent] DEFAULT: None

coder

The coder model to use. Defaults to AzureOpenAILMM.

TYPE: Optional[LMM] DEFAULT: None

tester

The tester model to use. Defaults to AzureOpenAILMM.

TYPE: Optional[LMM] DEFAULT: None

debugger

The debugger model to use. Defaults to AzureOpenAILMM.

TYPE: Optional[LMM] DEFAULT: None

verbosity

The verbosity level of the agent. Defaults to 0. 2 is the highest verbosity level which will output all intermediate debugging code.

TYPE: int DEFAULT: 0

report_progress_callback

A callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgentCoder instances are running in parallel. This callback ensures that the progress updates are not mixed up.

TYPE: Optional[Callable[[Dict[str, Any]], None]] DEFAULT: None

Source code in vision_agent/agent/vision_agent_coder.py
def __init__(
    self,
    planner: Optional[Agent] = None,
    coder: Optional[LMM] = None,
    tester: Optional[LMM] = None,
    debugger: Optional[LMM] = None,
    verbosity: int = 0,
    report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
    code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
) -> None:
    """Set up a Vision Agent Coder backed by Azure OpenAI models.

    Any collaborator left as None is replaced with its Azure default before
    everything is handed to the parent initializer.

    Parameters:
        planner (Optional[Agent]): The planner model to use. Defaults to
            AzureVisionAgentPlanner (created with the given verbosity).
        coder (Optional[LMM]): The coder model to use. Defaults to
            AzureOpenAILMM(temperature=0.0).
        tester (Optional[LMM]): The tester model to use. Defaults to
            AzureOpenAILMM(temperature=0.0).
        debugger (Optional[LMM]): The debugger model to use. Defaults to
            AzureOpenAILMM(temperature=0.0).
        verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
            highest verbosity level which will output all intermediate debugging
            code.
        report_progress_callback (Optional[Callable[[Dict[str, Any]], None]]):
            A callback to report the progress of the agent. This is useful for
            streaming logs in a web application where multiple
            VisionAgentCoder instances are running in parallel. This callback
            ensures that the progress updates are not mixed up.
        code_interpreter (Optional[Union[str, CodeInterpreter]]): Forwarded
            unchanged to the parent initializer.
    """
    # Fill in Azure defaults in the same order the arguments are declared.
    if planner is None:
        planner = AzureVisionAgentPlanner(verbosity=verbosity)
    if coder is None:
        coder = AzureOpenAILMM(temperature=0.0)
    if tester is None:
        tester = AzureOpenAILMM(temperature=0.0)
    if debugger is None:
        debugger = AzureOpenAILMM(temperature=0.0)

    super().__init__(
        planner=planner,
        coder=coder,
        tester=tester,
        debugger=debugger,
        verbosity=verbosity,
        report_progress_callback=report_progress_callback,
        code_interpreter=code_interpreter,
    )

planner instance-attribute

planner = (
    AnthropicVisionAgentPlanner(verbosity=verbosity)
    if planner is None
    else planner
)

coder instance-attribute

coder = (
    AnthropicLMM(temperature=0.0)
    if coder is None
    else coder
)

tester instance-attribute

tester = (
    AnthropicLMM(temperature=0.0)
    if tester is None
    else tester
)

debugger instance-attribute

debugger = (
    AnthropicLMM(temperature=0.0)
    if debugger is None
    else debugger
)

verbosity instance-attribute

verbosity = verbosity

report_progress_callback instance-attribute

report_progress_callback = report_progress_callback

code_interpreter instance-attribute

code_interpreter = code_interpreter

log_progress

log_progress(data)
Source code in vision_agent/agent/vision_agent_coder.py
def log_progress(self, data: Dict[str, Any]) -> None:
    """Forward a progress payload to the configured callback, if there is one.

    Parameters:
        data (Dict[str, Any]): The progress payload handed to
            ``self.report_progress_callback`` unchanged.
    """
    callback = self.report_progress_callback
    if callback is None:
        # No callback registered: progress reporting is a no-op.
        return
    callback(data)

generate_code_from_plan

generate_code_from_plan(
    chat, plan_context, code_interpreter=None
)

Generates code and other intermediate outputs from a chat input and a plan. The plan includes:

- plans: The plans generated by the planner.
- best_plan: The best plan selected by the planner.
- plan_thoughts: The thoughts of the planner, including any modifications to the plan.
- tool_doc: The tool documentation for the best plan.
- tool_output: The tool output from the tools used by the best plan.

PARAMETER DESCRIPTION
chat

A conversation in the format of [{"role": "user", "content": "describe your task here..."}].

TYPE: List[Message]

plan_context

The context of the plan, including the plans, best_plan, plan_thoughts, tool_doc, and tool_output.

TYPE: PlanContext

RETURNS DESCRIPTION
Dict[str, Any]

Dict[str, Any]: A dictionary containing the code output by the VisionAgentCoder and other intermediate outputs. It includes:

- status (str): Whether or not the agent completed or failed generating the code.
- code (str): The code output by the VisionAgentCoder.
- test (str): The test output by the VisionAgentCoder.
- test_result (Execution): The result of the test execution.
- plans (Dict[str, Any]): The plans generated by the planner.
- plan_thoughts (str): The thoughts of the planner.
- working_memory (List[Dict[str, str]]): The working memory of the agent.

Source code in vision_agent/agent/vision_agent_coder.py
def generate_code_from_plan(
    self,
    chat: List[Message],
    plan_context: PlanContext,
    code_interpreter: Optional[CodeInterpreter] = None,
) -> Dict[str, Any]:
    """Generates code and other intermediate outputs from a chat input and a plan.
    The plan includes:
        - plans: The plans generated by the planner.
        - best_plan: The best plan selected by the planner.
        - plan_thoughts: The thoughts of the planner, including any modifications
            to the plan.
        - tool_doc: The tool documentation for the best plan.
        - tool_output: The tool output from the tools used by the best plan.

    Parameters:
        chat (List[Message]): A conversation in the format of
            [{"role": "user", "content": "describe your task here..."}].
        plan_context (PlanContext): The context of the plan, including the plans,
            best_plan, plan_thoughts, tool_doc, and tool_output.
        code_interpreter (Optional[CodeInterpreter]): An interpreter supplied by
            the caller. When given, it is used for this call. When None, the
            instance's own ``self.code_interpreter`` is used if it is an
            interpreter object; otherwise a new one is created through
            ``CodeInterpreterFactory.new_instance``. The chosen interpreter is
            used as a context manager for the duration of the call.

    Returns:
        Dict[str, Any]: A dictionary containing the code output by the
            VisionAgentCoder and other intermediate outputs. include:
            - status (str): "completed" if code generation succeeded, otherwise
                "failed".
            - code (str): The code output by the VisionAgentCoder.
            - test (str): The test output by the VisionAgentCoder.
            - test_result (Execution): The result of the test execution.
            - plans (Dict[str, Any]): The plans generated by the planner.
            - plan_thoughts (str): The thoughts of the planner.
            - working_memory (List[Dict[str, str]]): The working memory of the agent.

    Raises:
        ValueError: If ``chat`` is empty.
    """
    if not chat:
        raise ValueError("Chat cannot be empty.")

    # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
    # Honour an interpreter passed in by the caller; previously the argument was
    # overwritten unconditionally, so a caller-provided interpreter was ignored.
    if code_interpreter is None:
        code_interpreter = (
            self.code_interpreter
            if self.code_interpreter is not None
            and not isinstance(self.code_interpreter, str)
            else CodeInterpreterFactory.new_instance(
                code_sandbox_runtime=self.code_interpreter,
            )
        )
    with code_interpreter:
        # Work on a copy so the caller's messages are not mutated below.
        chat = copy.deepcopy(chat)
        media_list = []
        for chat_i in chat:
            if "media" in chat_i:
                for media in chat_i["media"]:
                    # Mention each media name in the message text itself.
                    chat_i["content"] += f" Media name {media}"  # type: ignore
                    media_list.append(str(media))

        # Keep only the role/content/media keys of every message.
        int_chat = cast(
            List[Message],
            [
                (
                    {
                        "role": c["role"],
                        "content": c["content"],
                        "media": c["media"],
                    }
                    if "media" in c
                    else {"role": c["role"], "content": c["content"]}
                )
                for c in chat
            ],
        )

        working_memory: List[Dict[str, str]] = []
        plan = plan_context.plans[plan_context.best_plan]
        tool_doc = plan_context.tool_doc
        tool_output_str = plan_context.tool_output
        plan_thoughts_str = str(plan_context.plan_thoughts)

        if self.verbosity >= 1:
            plan_fixed = [{"instructions": e} for e in plan["instructions"]]
            _LOGGER.info(
                f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
            )

        results = write_and_test_code(
            # Media is passed separately through ``media``; the chat handed to
            # the coder carries text only.
            chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
            plan=f"\n{plan['thoughts']}\n-" + "\n-".join(plan["instructions"]),
            tool_info=tool_doc,
            tool_output=tool_output_str,
            plan_thoughts=plan_thoughts_str,
            tool_utils=T.UTILITIES_DOCSTRING,
            working_memory=working_memory,
            coder=self.coder,
            tester=self.tester,
            debugger=self.debugger,
            code_interpreter=code_interpreter,
            log_progress=self.log_progress,
            verbosity=self.verbosity,
            media=media_list,
        )
        success = cast(bool, results["success"])
        code = remove_installs_from_code(cast(str, results["code"]))
        test = remove_installs_from_code(cast(str, results["test"]))
        working_memory.extend(results["working_memory"])
        execution_result = cast(Execution, results["test_result"])

        return {
            "status": "completed" if success else "failed",
            "code": DefaultImports.prepend_imports(code),
            "test": test,
            "test_result": execution_result,
            "plans": plan_context.plans,
            "plan_thoughts": plan_thoughts_str,
            "working_memory": working_memory,
        }

generate_code

generate_code(
    chat, test_multi_plan=True, custom_tool_names=None
)

Generates code and other intermediate outputs from a chat input.

PARAMETER DESCRIPTION
chat

A conversation in the format of [{"role": "user", "content": "describe your task here..."}].

TYPE: List[Message]

test_multi_plan

Whether to test multiple plans or just the best plan.

TYPE: bool DEFAULT: True

custom_tool_names

A list of custom tool names to use for the planner.

TYPE: Optional[List[str]] DEFAULT: None

RETURNS DESCRIPTION
Dict[str, Any]

Dict[str, Any]: A dictionary containing the code output by the VisionAgentCoder and other intermediate outputs. It includes:

- status (str): Whether or not the agent completed or failed generating the code.
- code (str): The code output by the VisionAgentCoder.
- test (str): The test output by the VisionAgentCoder.
- test_result (Execution): The result of the test execution.
- plans (Dict[str, Any]): The plans generated by the planner.
- plan_thoughts (str): The thoughts of the planner.
- working_memory (List[Dict[str, str]]): The working memory of the agent.

Source code in vision_agent/agent/vision_agent_coder.py
def generate_code(
    self,
    chat: List[Message],
    test_multi_plan: bool = True,
    custom_tool_names: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Plan a solution for the chat, then generate code from that plan.

    The planner is asked for a plan first; the resulting plan context is then
    handed to ``generate_code_from_plan`` together with the same interpreter.

    Parameters:
        chat (List[Message]): A conversation in the format of
            [{"role": "user", "content": "describe your task here..."}].
        test_multi_plan (bool): Whether to test multiple plans or just the best plan.
        custom_tool_names (Optional[List[str]]): A list of custom tool names to use
            for the planner.

    Returns:
        Dict[str, Any]: The dictionary produced by ``generate_code_from_plan``,
            containing the code output by the VisionAgentCoder and other
            intermediate outputs:
            - status (str): Whether or not the agent completed or failed generating
                the code.
            - code (str): The code output by the VisionAgentCoder.
            - test (str): The test output by the VisionAgentCoder.
            - test_result (Execution): The result of the test execution.
            - plans (Dict[str, Any]): The plans generated by the planner.
            - plan_thoughts (str): The thoughts of the planner.
            - working_memory (List[Dict[str, str]]): The working memory of the agent.

    Raises:
        ValueError: If ``chat`` is empty.
    """
    if not chat:
        raise ValueError("Chat cannot be empty.")

    # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
    configured = self.code_interpreter
    if configured is None or isinstance(configured, str):
        # Nothing usable is stored on the instance (None or a runtime name), so
        # ask the factory for a new interpreter.
        code_interpreter = CodeInterpreterFactory.new_instance(
            code_sandbox_runtime=configured,
        )
    else:
        code_interpreter = configured

    with code_interpreter:
        plan_context = self.planner.generate_plan(  # type: ignore
            chat,
            test_multi_plan=test_multi_plan,
            custom_tool_names=custom_tool_names,
            code_interpreter=code_interpreter,
        )

        code_and_context = self.generate_code_from_plan(
            chat,
            plan_context,
            code_interpreter=code_interpreter,
        )
    return code_and_context

chat

chat(chat)
Source code in vision_agent/agent/vision_agent_coder.py
def chat(self, chat: List[Message]) -> List[Message]:
    """Generate code for a conversation and append it as an agent message.

    Parameters:
        chat (List[Message]): The conversation so far. It is deep-copied, so the
            caller's list is left untouched.

    Returns:
        List[Message]: A copy of the conversation with one extra message of
            role "agent" whose content is the "code" entry returned by
            ``generate_code``.
    """
    conversation = copy.deepcopy(chat)
    generated = self.generate_code(conversation)
    agent_turn = {"role": "agent", "content": generated["code"]}
    conversation.append(agent_turn)
    return conversation