Skip to content

vision_agent.lmm

vision_agent.lmm.OpenAILMM

OpenAILMM(
    model_name="gpt-4o-2024-05-13",
    api_key=None,
    max_tokens=4096,
    json_mode=False,
    image_size=768,
    image_detail="low",
    **kwargs
)

Bases: LMM

An LMM class for the OpenAI LMMs.

Source code in vision_agent/lmm/lmm.py
def __init__(
    self,
    model_name: str = "gpt-4o-2024-05-13",
    api_key: Optional[str] = None,
    max_tokens: int = 4096,
    json_mode: bool = False,
    image_size: int = 768,
    image_detail: str = "low",
    **kwargs: Any,
):
    """Initializes the OpenAI LMM.

    Parameters:
        model_name (str): Name of the model sent with every request.
        api_key (Optional[str]): API key handed to the OpenAI client. When it
            is empty the client is created without an explicit key.
        max_tokens (int): Default ``max_tokens`` request value. It is only
            applied when the caller did not pass ``max_tokens`` in kwargs and
            the model name does not start with ``"o1"``.
        json_mode (bool): When True, requests ask for a JSON object response.
        image_size (int): Default resize value used by ``chat`` when encoding
            media.
        image_detail (str): Default ``detail`` value attached to images.
        kwargs (Any): Extra default request arguments stored on the instance
            and merged into every call.
    """
    # Build the client exactly once. The previous code created a client in
    # this branch and then unconditionally replaced it with a second one.
    if not api_key:
        # NOTE(review): presumably the SDK then resolves the key from the
        # environment; confirm against the OpenAI client documentation.
        self.client = OpenAI()
    else:
        self.client = OpenAI(api_key=api_key)

    self.model_name = model_name
    self.image_size = image_size
    self.image_detail = image_detail
    # o1 does not use max_tokens
    if "max_tokens" not in kwargs and not model_name.startswith("o1"):
        kwargs["max_tokens"] = max_tokens
    if json_mode:
        kwargs["response_format"] = {"type": "json_object"}
    self.kwargs = kwargs

client instance-attribute

client = OpenAI(api_key=api_key)

model_name instance-attribute

model_name = model_name

image_size instance-attribute

image_size = image_size

image_detail instance-attribute

image_detail = image_detail

kwargs instance-attribute

kwargs = kwargs

chat

chat(chat, **kwargs)

Chat with the LMM model.

PARAMETER DESCRIPTION
chat

A list of dictionaries containing the chat messages. The messages can be in the format: [{"role": "user", "content": "Hello!"}, ...] or if it contains media, it should be in the format: [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]

TYPE: Sequence[Dict[str, str]]

Source code in vision_agent/lmm/lmm.py
def chat(
    self,
    chat: Sequence[Message],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Chat with the LMM model.

    Parameters:
        chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
            messages. The messages can be in the format:
            [{"role": "user", "content": "Hello!"}, ...]
            or if it contains media, it should be in the format:
            [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
        kwargs (Any): Per-call overrides merged over the instance defaults.
            ``resize`` and ``image_detail`` are consumed here to prepare the
            images; every other key is forwarded to the completion request.

    Returns:
        The content of the first choice, or, when ``stream`` is truthy, an
        iterator yielding the content delta of each streamed chunk.
    """
    # prefers kwargs from second dictionary over first
    tmp_kwargs = self.kwargs | kwargs

    # "resize" and "image_detail" are image-preparation options of this
    # class, not request parameters, so they must not be forwarded to
    # ``completions.create`` (which rejects unknown keyword arguments).
    local_opts: Dict[str, Any] = {}
    for key in ("resize", "image_detail"):
        if key in tmp_kwargs:
            local_opts[key] = tmp_kwargs.pop(key)

    fixed_chat = []
    for c in chat:
        fixed_c = {"role": c["role"]}
        fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
        if "media" in c:
            # Instance defaults are only read when no override was given.
            resize = (
                local_opts["resize"] if "resize" in local_opts else self.image_size
            )
            image_detail = (
                local_opts["image_detail"]
                if "image_detail" in local_opts
                else self.image_detail
            )
            for media in c["media"]:
                encoded_media = encode_media(cast(str, media), resize=resize)

                fixed_c["content"].append(  # type: ignore
                    {
                        "type": "image_url",
                        "image_url": {
                            # URLs and ready-made data URIs pass through
                            # unchanged; anything else is wrapped as a PNG
                            # base64 data URI.
                            "url": (
                                encoded_media
                                if encoded_media.startswith(("http", "data:image/"))
                                else f"data:image/png;base64,{encoded_media}"
                            ),
                            "detail": image_detail,
                        },
                    },
                )
        fixed_chat.append(fixed_c)

    response = self.client.chat.completions.create(
        model=self.model_name, messages=fixed_chat, **tmp_kwargs  # type: ignore
    )
    if tmp_kwargs.get("stream"):

        def f() -> Iterator[Optional[str]]:
            for chunk in response:
                # A streamed chunk may carry no choices at all (for example
                # a usage-only chunk); skip it instead of raising IndexError.
                if not chunk.choices:  # type: ignore
                    continue
                yield chunk.choices[0].delta.content  # type: ignore

        return f()

    return cast(str, response.choices[0].message.content)

generate

generate(prompt, media=None, **kwargs)
Source code in vision_agent/lmm/lmm.py
def generate(
    self,
    prompt: str,
    media: Optional[Sequence[Union[str, Path]]] = None,
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Generate a response for a single user prompt, optionally with media.

    Parameters:
        prompt (str): Text of the single user message.
        media (Optional[Sequence[Union[str, Path]]]): Media items attached to
            the message as ``image_url`` parts.
        kwargs (Any): Per-call overrides merged over the instance defaults.
            ``resize`` (default None) and ``image_detail`` (default
            ``self.image_detail``) are consumed here to prepare the images;
            every other key is forwarded to the completion request.

    Returns:
        The content of the first choice, or, when ``stream`` is truthy, an
        iterator yielding the content delta of each streamed chunk.
    """
    # prefers kwargs from second dictionary over first
    tmp_kwargs = self.kwargs | kwargs

    # "resize" and "image_detail" are image-preparation options of this
    # class, not request parameters, so they must not be forwarded to
    # ``completions.create`` (which rejects unknown keyword arguments).
    local_opts: Dict[str, Any] = {}
    for key in ("resize", "image_detail"):
        if key in tmp_kwargs:
            local_opts[key] = tmp_kwargs.pop(key)

    content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
    if media:
        resize = local_opts.get("resize")
        image_detail = (
            local_opts["image_detail"]
            if "image_detail" in local_opts
            else self.image_detail
        )
        for m in media:
            encoded_media = encode_media(m, resize=resize)
            content.append(
                {
                    "type": "image_url",
                    "image_url": {
                        # URLs and ready-made data URIs pass through
                        # unchanged; anything else is wrapped as a PNG
                        # base64 data URI.
                        "url": (
                            encoded_media
                            if encoded_media.startswith(("http", "data:image/"))
                            else f"data:image/png;base64,{encoded_media}"
                        ),
                        "detail": image_detail,
                    },
                },
            )
    message: List[Dict[str, Any]] = [{"role": "user", "content": content}]

    response = self.client.chat.completions.create(
        model=self.model_name, messages=message, **tmp_kwargs  # type: ignore
    )
    if tmp_kwargs.get("stream"):

        def f() -> Iterator[Optional[str]]:
            for chunk in response:
                # A streamed chunk may carry no choices at all (for example
                # a usage-only chunk); skip it instead of raising IndexError.
                if not chunk.choices:  # type: ignore
                    continue
                yield chunk.choices[0].delta.content  # type: ignore

        return f()

    return cast(str, response.choices[0].message.content)

vision_agent.lmm.AzureOpenAILMM

AzureOpenAILMM(
    model_name=None,
    api_key=None,
    api_version="2024-02-01",
    azure_endpoint=None,
    max_tokens=4096,
    json_mode=False,
    image_detail="low",
    **kwargs
)

Bases: OpenAILMM

Source code in vision_agent/lmm/lmm.py
def __init__(
    self,
    model_name: Optional[str] = None,
    api_key: Optional[str] = None,
    api_version: str = "2024-02-01",
    azure_endpoint: Optional[str] = None,
    max_tokens: int = 4096,
    json_mode: bool = False,
    image_detail: str = "low",
    image_size: int = 768,
    **kwargs: Any,
):
    """Initializes the Azure OpenAI LMM.

    Parameters:
        model_name (Optional[str]): Deployment name of the chat model. Falls
            back to the ``AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME``
            environment variable.
        api_key (Optional[str]): API key. Falls back to the
            ``AZURE_OPENAI_API_KEY`` environment variable.
        api_version (str): API version passed to the Azure client.
        azure_endpoint (Optional[str]): Endpoint URL. Falls back to the
            ``AZURE_OPENAI_ENDPOINT`` environment variable.
        max_tokens (int): Default ``max_tokens`` request value, applied when
            the caller did not pass ``max_tokens`` in kwargs.
        json_mode (bool): When True, requests ask for a JSON object response.
        image_detail (str): Default ``detail`` value attached to images.
        image_size (int): Default resize value used by ``chat`` when encoding
            media.
        kwargs (Any): Extra default request arguments stored on the instance
            and merged into every call.

    Raises:
        ValueError: If the API key, the endpoint or the deployment name is
            neither passed in nor available from the environment.
    """
    if not api_key:
        api_key = os.getenv("AZURE_OPENAI_API_KEY")
    if not azure_endpoint:
        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    if not model_name:
        model_name = os.getenv("AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME")

    if not api_key:
        raise ValueError("OpenAI API key is required.")
    if not azure_endpoint:
        raise ValueError("Azure OpenAI endpoint is required.")
    if not model_name:
        raise ValueError("Azure OpenAI chat model deployment name is required.")

    self.client = AzureOpenAI(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
    )
    self.model_name = model_name
    self.image_detail = image_detail
    # The parent initializer is not called here, so the attribute that
    # ``chat`` reads as its default resize value has to be set explicitly;
    # without it, chatting with media raises AttributeError.
    self.image_size = image_size

    if "max_tokens" not in kwargs:
        kwargs["max_tokens"] = max_tokens
    if json_mode:
        kwargs["response_format"] = {"type": "json_object"}
    self.kwargs = kwargs

image_size instance-attribute

image_size = image_size

client instance-attribute

client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=azure_endpoint,
)

model_name instance-attribute

model_name = model_name

image_detail instance-attribute

image_detail = image_detail

kwargs instance-attribute

kwargs = kwargs

generate

generate(prompt, media=None, **kwargs)
Source code in vision_agent/lmm/lmm.py
def generate(
    self,
    prompt: str,
    media: Optional[Sequence[Union[str, Path]]] = None,
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Generate a response for a single user prompt, optionally with media.

    Parameters:
        prompt (str): Text of the single user message.
        media (Optional[Sequence[Union[str, Path]]]): Media items attached to
            the message as ``image_url`` parts.
        kwargs (Any): Per-call overrides merged over the instance defaults.
            ``resize`` (default None) and ``image_detail`` (default
            ``self.image_detail``) are consumed here to prepare the images;
            every other key is forwarded to the completion request.

    Returns:
        The content of the first choice, or, when ``stream`` is truthy, an
        iterator yielding the content delta of each streamed chunk.
    """
    # prefers kwargs from second dictionary over first
    tmp_kwargs = self.kwargs | kwargs

    # "resize" and "image_detail" are image-preparation options of this
    # class, not request parameters, so they must not be forwarded to
    # ``completions.create`` (which rejects unknown keyword arguments).
    local_opts: Dict[str, Any] = {}
    for key in ("resize", "image_detail"):
        if key in tmp_kwargs:
            local_opts[key] = tmp_kwargs.pop(key)

    content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
    if media:
        resize = local_opts.get("resize")
        image_detail = (
            local_opts["image_detail"]
            if "image_detail" in local_opts
            else self.image_detail
        )
        for m in media:
            encoded_media = encode_media(m, resize=resize)
            content.append(
                {
                    "type": "image_url",
                    "image_url": {
                        # URLs and ready-made data URIs pass through
                        # unchanged; anything else is wrapped as a PNG
                        # base64 data URI.
                        "url": (
                            encoded_media
                            if encoded_media.startswith(("http", "data:image/"))
                            else f"data:image/png;base64,{encoded_media}"
                        ),
                        "detail": image_detail,
                    },
                },
            )
    message: List[Dict[str, Any]] = [{"role": "user", "content": content}]

    response = self.client.chat.completions.create(
        model=self.model_name, messages=message, **tmp_kwargs  # type: ignore
    )
    if tmp_kwargs.get("stream"):

        def f() -> Iterator[Optional[str]]:
            for chunk in response:
                # A streamed chunk may carry no choices at all; skip it
                # instead of raising IndexError on ``choices[0]``.
                if not chunk.choices:  # type: ignore
                    continue
                yield chunk.choices[0].delta.content  # type: ignore

        return f()

    return cast(str, response.choices[0].message.content)

chat

chat(chat, **kwargs)

Chat with the LMM model.

PARAMETER DESCRIPTION
chat

A list of dictionaries containing the chat messages. The messages can be in the format: [{"role": "user", "content": "Hello!"}, ...] or if it contains media, it should be in the format: [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]

TYPE: Sequence[Dict[str, str]]

Source code in vision_agent/lmm/lmm.py
def chat(
    self,
    chat: Sequence[Message],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Chat with the LMM model.

    Parameters:
        chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
            messages. The messages can be in the format:
            [{"role": "user", "content": "Hello!"}, ...]
            or if it contains media, it should be in the format:
            [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
        kwargs (Any): Per-call overrides merged over the instance defaults.
            ``resize`` and ``image_detail`` are consumed here to prepare the
            images; every other key is forwarded to the completion request.

    Returns:
        The content of the first choice, or, when ``stream`` is truthy, an
        iterator yielding the content delta of each streamed chunk.
    """
    # prefers kwargs from second dictionary over first
    tmp_kwargs = self.kwargs | kwargs

    # "resize" and "image_detail" are image-preparation options of this
    # class, not request parameters, so they must not be forwarded to
    # ``completions.create`` (which rejects unknown keyword arguments).
    local_opts: Dict[str, Any] = {}
    for key in ("resize", "image_detail"):
        if key in tmp_kwargs:
            local_opts[key] = tmp_kwargs.pop(key)

    fixed_chat = []
    for c in chat:
        fixed_c = {"role": c["role"]}
        fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
        if "media" in c:
            # Instance defaults are only read when no override was given.
            resize = (
                local_opts["resize"] if "resize" in local_opts else self.image_size
            )
            image_detail = (
                local_opts["image_detail"]
                if "image_detail" in local_opts
                else self.image_detail
            )
            for media in c["media"]:
                encoded_media = encode_media(cast(str, media), resize=resize)

                fixed_c["content"].append(  # type: ignore
                    {
                        "type": "image_url",
                        "image_url": {
                            # URLs and ready-made data URIs pass through
                            # unchanged; anything else is wrapped as a PNG
                            # base64 data URI.
                            "url": (
                                encoded_media
                                if encoded_media.startswith(("http", "data:image/"))
                                else f"data:image/png;base64,{encoded_media}"
                            ),
                            "detail": image_detail,
                        },
                    },
                )
        fixed_chat.append(fixed_c)

    response = self.client.chat.completions.create(
        model=self.model_name, messages=fixed_chat, **tmp_kwargs  # type: ignore
    )
    if tmp_kwargs.get("stream"):

        def f() -> Iterator[Optional[str]]:
            for chunk in response:
                # A streamed chunk may carry no choices at all; skip it
                # instead of raising IndexError on ``choices[0]``.
                if not chunk.choices:  # type: ignore
                    continue
                yield chunk.choices[0].delta.content  # type: ignore

        return f()

    return cast(str, response.choices[0].message.content)

vision_agent.lmm.OllamaLMM

OllamaLMM(
    model_name="llava",
    base_url="http://localhost:11434/api",
    json_mode=False,
    num_ctx=128000,
    image_size=768,
    **kwargs
)

Bases: LMM

An LMM class for Ollama.

Initializes the Ollama LMM. kwargs are passed as 'options' to the model. More information on options can be found here https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values

PARAMETER DESCRIPTION
model_name

The ollama name of the model.

TYPE: str DEFAULT: 'llava'

base_url

The base URL of the Ollama API.

TYPE: str DEFAULT: 'http://localhost:11434/api'

json_mode

Whether to use JSON mode.

TYPE: bool DEFAULT: False

num_ctx

The context length for the model.

TYPE: int DEFAULT: 128000

kwargs

Additional options to pass to the model.

TYPE: Any DEFAULT: {}

Source code in vision_agent/lmm/lmm.py
def __init__(
    self,
    model_name: str = "llava",
    base_url: Optional[str] = "http://localhost:11434/api",
    json_mode: bool = False,
    num_ctx: int = 128_000,
    image_size: int = 768,
    **kwargs: Any,
):
    """Initializes the Ollama LMM. kwargs are passed as 'options' to the model.
    More information on options can be found here
    https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values

    Parameters:
        model_name (str): The ollama name of the model.
        base_url (str): The base URL of the Ollama API.
        json_mode (bool): Whether to use JSON mode.
        num_ctx (int): The context length for the model.
        image_size (int): Default resize value used when encoding media.
        kwargs (Any): Additional options to pass to the model.
    """

    self.url = base_url
    self.model_name = model_name
    self.image_size = image_size
    self.kwargs = {"options": kwargs}

    if json_mode:
        self.kwargs["format"] = "json"  # type: ignore
    # The option key must be spelled "num_ctx"; the former "num_cxt" typo
    # stored the context length under a key that is not the num_ctx option.
    self.kwargs["options"]["num_ctx"] = num_ctx

url instance-attribute

url = base_url

model_name instance-attribute

model_name = model_name

image_size instance-attribute

image_size = image_size

kwargs instance-attribute

kwargs = {'options': kwargs}

chat

chat(chat, **kwargs)

Chat with the LMM model.

PARAMETER DESCRIPTION
chat

A list of dictionaries containing the chat messages. The messages can be in the format: [{"role": "user", "content": "Hello!"}, ...] or if it contains media, it should be in the format: [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]

TYPE: Sequence[Dict[str, str]]

Source code in vision_agent/lmm/lmm.py
def chat(
    self,
    chat: Sequence[Message],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Chat with the LMM model.

    Parameters:
        chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
            messages. The messages can be in the format:
            [{"role": "user", "content": "Hello!"}, ...]
            or if it contains media, it should be in the format:
            [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
        kwargs (Any): Per-call overrides merged over the instance defaults
            and added to the request payload. ``resize`` is consumed here for
            image encoding and is not sent.

    Returns:
        The message content of the response, or, when ``stream`` is truthy,
        an iterator yielding the content of each streamed message and None
        for a record flagged as done.

    Raises:
        ValueError: If the server answers with a status code other than 200.
    """
    tmp_kwargs = self.kwargs | kwargs
    # "resize" only controls local image encoding; keep it out of the
    # request payload.
    has_resize = "resize" in tmp_kwargs
    resize_override = tmp_kwargs.pop("resize", None)

    fixed_chat = []
    for message in chat:
        # Work on a shallow copy so the caller's message dicts keep their
        # "media" entry and do not gain an "images" entry.
        fixed_message: Dict[str, Any] = dict(message)
        if "media" in fixed_message:
            resize = resize_override if has_resize else self.image_size
            fixed_message["images"] = [
                encode_media(cast(str, m), resize=resize)
                for m in fixed_message.pop("media")
            ]
        fixed_chat.append(fixed_message)

    url = f"{self.url}/chat"
    data: Dict[str, Any] = {"model": self.model_name, "messages": fixed_chat}
    data.update(tmp_kwargs)

    if tmp_kwargs.get("stream"):
        json_data = json.dumps(data)

        def f() -> Iterator[Optional[str]]:
            with requests.post(url, data=json_data, stream=True) as stream:
                if stream.status_code != 200:
                    raise ValueError(
                        f"Request failed with status code {stream.status_code}"
                    )

                # Parse one JSON document per line: raw transport chunks are
                # not guaranteed to line up with record boundaries, so
                # json.loads on an arbitrary chunk can fail.
                for line in stream.iter_lines():
                    if not line:
                        continue
                    chunk_data = json.loads(line)
                    if chunk_data["done"]:
                        yield None
                    else:
                        yield chunk_data["message"]["content"]

        return f()

    data["stream"] = False
    json_data = json.dumps(data)
    resp = requests.post(url, data=json_data)

    if resp.status_code != 200:
        raise ValueError(f"Request failed with status code {resp.status_code}")
    payload = resp.json()
    return payload["message"]["content"]  # type: ignore

generate

generate(prompt, media=None, **kwargs)
Source code in vision_agent/lmm/lmm.py
def generate(
    self,
    prompt: str,
    media: Optional[Sequence[Union[str, Path]]] = None,
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Generate a response for a single prompt, optionally with media.

    Parameters:
        prompt (str): The prompt text sent to the model.
        media (Optional[Sequence[Union[str, Path]]]): Media items that are
            encoded and sent in the ``images`` field.
        kwargs (Any): Per-call overrides merged over the instance defaults
            and added to the request payload. ``resize`` is consumed here for
            image encoding and is not sent.

    Returns:
        The ``response`` field of the reply, or, when ``stream`` is truthy,
        an iterator yielding the ``response`` field of each streamed record
        and None for a record flagged as done.

    Raises:
        ValueError: If the server answers with a status code other than 200.
    """
    tmp_kwargs = self.kwargs | kwargs
    # "resize" only controls local image encoding; keep it out of the
    # request payload.
    has_resize = "resize" in tmp_kwargs
    resize_override = tmp_kwargs.pop("resize", None)

    url = f"{self.url}/generate"
    data: Dict[str, Any] = {
        "model": self.model_name,
        "prompt": prompt,
        "images": [],
    }

    if media:
        resize = resize_override if has_resize else self.image_size
        data["images"] = [encode_media(m, resize=resize) for m in media]

    data.update(tmp_kwargs)
    if tmp_kwargs.get("stream"):
        json_data = json.dumps(data)

        def f() -> Iterator[Optional[str]]:
            with requests.post(url, data=json_data, stream=True) as stream:
                if stream.status_code != 200:
                    raise ValueError(
                        f"Request failed with status code {stream.status_code}"
                    )

                # Parse one JSON document per line: raw transport chunks are
                # not guaranteed to line up with record boundaries, so
                # json.loads on an arbitrary chunk can fail.
                for line in stream.iter_lines():
                    if not line:
                        continue
                    chunk_data = json.loads(line)
                    if chunk_data["done"]:
                        yield None
                    else:
                        yield chunk_data["response"]

        return f()

    data["stream"] = False
    json_data = json.dumps(data)
    resp = requests.post(url, data=json_data)

    if resp.status_code != 200:
        raise ValueError(f"Request failed with status code {resp.status_code}")

    payload = resp.json()
    return payload["response"]  # type: ignore

vision_agent.lmm.AnthropicLMM

AnthropicLMM(
    api_key=None,
    model_name="claude-3-5-sonnet-20240620",
    max_tokens=4096,
    image_size=768,
    **kwargs
)

Bases: LMM

An LMM class for Anthropic's LMMs.

Source code in vision_agent/lmm/lmm.py
def __init__(
    self,
    api_key: Optional[str] = None,
    model_name: str = "claude-3-5-sonnet-20240620",
    max_tokens: int = 4096,
    image_size: int = 768,
    **kwargs: Any,
):
    """Initializes the Anthropic LMM.

    Parameters:
        api_key (Optional[str]): API key handed to the Anthropic client.
        model_name (str): Name of the model sent with every request.
        max_tokens (int): Default ``max_tokens`` request value, used when the
            caller did not pass ``max_tokens`` in kwargs.
        image_size (int): Default resize value used when encoding media.
        kwargs (Any): Extra default request arguments stored on the instance
            and merged into every call.
    """
    self.client = anthropic.Anthropic(api_key=api_key)
    self.image_size, self.model_name = image_size, model_name
    # An explicit max_tokens in kwargs wins over the named default.
    kwargs.setdefault("max_tokens", max_tokens)
    self.kwargs = kwargs

client instance-attribute

client = Anthropic(api_key=api_key)

image_size instance-attribute

image_size = image_size

model_name instance-attribute

model_name = model_name

kwargs instance-attribute

kwargs = kwargs

chat

chat(chat, **kwargs)
Source code in vision_agent/lmm/lmm.py
def chat(
    self,
    chat: Sequence[Dict[str, Any]],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Chat with the LMM model.

    Parameters:
        chat (Sequence[Dict[str, Any]]): Chat messages, each with "role" and
            "content" keys and optionally a "media" list of items to attach
            as images.
        kwargs (Any): Per-call overrides merged over the instance defaults.
            ``resize`` is consumed here for image encoding; every other key
            is forwarded to the messages request.

    Returns:
        The text of the first content block, or, when ``stream`` is truthy,
        an iterator yielding the text of each content delta and None when the
        message stops.
    """
    # prefers kwargs from second dictionary over first
    tmp_kwargs = self.kwargs | kwargs
    # "resize" is an image-preparation option of this class, not a request
    # parameter, so it must not be forwarded to ``messages.create``.
    has_resize = "resize" in tmp_kwargs
    resize_override = tmp_kwargs.pop("resize", None)

    messages: List[MessageParam] = []
    for msg in chat:
        content: List[Union[TextBlockParam, ImageBlockParam]] = [
            TextBlockParam(type="text", text=msg["content"])
        ]
        if "media" in msg:
            resize = resize_override if has_resize else self.image_size
            for media_path in msg["media"]:
                encoded_media = encode_media(media_path, resize=resize)
                # The image block takes the bare base64 payload, so drop a
                # leading PNG data-URI prefix when present.
                encoded_media = encoded_media.removeprefix("data:image/png;base64,")
                content.append(
                    ImageBlockParam(
                        type="image",
                        source={
                            "type": "base64",
                            "media_type": "image/png",
                            "data": encoded_media,
                        },
                    )
                )
        messages.append({"role": msg["role"], "content": content})

    response = self.client.messages.create(
        model=self.model_name, messages=messages, **tmp_kwargs
    )
    if tmp_kwargs.get("stream"):

        def f() -> Iterator[Optional[str]]:
            for chunk in response:
                if (
                    chunk.type == "message_start"
                    or chunk.type == "content_block_start"
                ):
                    continue
                elif chunk.type == "content_block_delta":
                    yield chunk.delta.text
                elif chunk.type == "message_stop":
                    yield None

        return f()

    return cast(str, response.content[0].text)

generate

generate(prompt, media=None, **kwargs)
Source code in vision_agent/lmm/lmm.py
def generate(
    self,
    prompt: str,
    media: Optional[Sequence[Union[str, Path]]] = None,
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Generate a response for a single user prompt, optionally with media.

    Parameters:
        prompt (str): Text of the single user message.
        media (Optional[Sequence[Union[str, Path]]]): Media items attached to
            the message as base64 image blocks.
        kwargs (Any): Per-call overrides merged over the instance defaults.
            ``resize`` is consumed here for image encoding; every other key
            is forwarded to the messages request.

    Returns:
        The text of the first content block, or, when ``stream`` is truthy,
        an iterator yielding the text of each content delta and None when the
        message stops.
    """
    # prefers kwargs from second dictionary over first
    tmp_kwargs = self.kwargs | kwargs
    # "resize" is an image-preparation option of this class, not a request
    # parameter, so it must not be forwarded to ``messages.create``.
    has_resize = "resize" in tmp_kwargs
    resize_override = tmp_kwargs.pop("resize", None)

    content: List[Union[TextBlockParam, ImageBlockParam]] = [
        TextBlockParam(type="text", text=prompt)
    ]
    if media:
        resize = resize_override if has_resize else self.image_size
        for m in media:
            encoded_media = encode_media(m, resize=resize)
            # The image block takes the bare base64 payload, so drop a
            # leading PNG data-URI prefix when present.
            encoded_media = encoded_media.removeprefix("data:image/png;base64,")
            content.append(
                ImageBlockParam(
                    type="image",
                    source={
                        "type": "base64",
                        "media_type": "image/png",
                        "data": encoded_media,
                    },
                )
            )

    response = self.client.messages.create(
        model=self.model_name,
        messages=[{"role": "user", "content": content}],
        **tmp_kwargs,
    )
    if tmp_kwargs.get("stream"):

        def f() -> Iterator[Optional[str]]:
            for chunk in response:
                if (
                    chunk.type == "message_start"
                    or chunk.type == "content_block_start"
                ):
                    continue
                elif chunk.type == "content_block_delta":
                    yield chunk.delta.text
                elif chunk.type == "message_stop":
                    yield None

        return f()

    return cast(str, response.content[0].text)