Skip to content

vision_agent.sim

vision_agent.sim.AzureSim

AzureSim(
    df,
    sim_key=None,
    api_key=None,
    api_version="2024-02-01",
    azure_endpoint=None,
    model=None,
)

Bases: Sim

Source code in vision_agent/sim/sim.py
def __init__(
    self,
    df: pd.DataFrame,
    sim_key: Optional[str] = None,
    api_key: Optional[str] = None,
    api_version: str = "2024-02-01",
    azure_endpoint: Optional[str] = None,
    model: Optional[str] = None,
) -> None:
    """Creates a similarity object whose embeddings come from Azure OpenAI.

    Parameters:
        df: pd.DataFrame: The dataframe to use for similarity.
        sim_key: Optional[str]: The column to embed into a new ``embs`` column.
            May only be omitted when ``df`` already has an ``embs`` column.
        api_key: Optional[str]: The Azure OpenAI API key. When falsy, the
            ``AZURE_OPENAI_API_KEY`` environment variable is used.
        api_version: str: The API version handed to ``AzureOpenAI``.
        azure_endpoint: Optional[str]: The Azure endpoint. When falsy, the
            ``AZURE_OPENAI_ENDPOINT`` environment variable is used.
        model: Optional[str]: The embedding deployment name. When falsy, the
            ``AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME`` environment
            variable is used.

    Raises:
        ValueError: If the API key, endpoint or deployment name is still
            missing after the environment fallback, or if ``df`` has no
            ``embs`` column and ``sim_key`` is None.
    """
    # Explicit arguments win; environment variables are only a fallback.
    api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
    azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
    model = model or os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME")

    # Checked in this order so the first missing setting is the one reported.
    for setting, message in (
        (api_key, "Azure OpenAI API key is required."),
        (azure_endpoint, "Azure OpenAI endpoint is required."),
        (model, "Azure OpenAI embedding model deployment name is required."),
    ):
        if not setting:
            raise ValueError(message)

    self.df = df
    azure_client = AzureOpenAI(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
    )

    def embed(text):
        # Only the first embedding of the response is used.
        response = azure_client.embeddings.create(input=text, model=model)
        return response.data[0].embedding

    self.emb_call = embed
    self.model = model

    has_embs = "embs" in df.columns
    if not has_embs and sim_key is None:
        raise ValueError("key is required if no column 'embs' is present.")
    if sim_key is None:
        return

    # A given sim_key always (re)computes the embs column.
    embedded = self.df[sim_key].apply(
        lambda value: get_embedding(self.emb_call, value)
    )
    self.df = self.df.assign(embs=embedded)

client instance-attribute

client = OpenAI(api_key=api_key)

df instance-attribute

df = df

emb_call instance-attribute

emb_call = lambda text: client.embeddings.create(input=text, model=model).data[0].embedding

model instance-attribute

model = model

save

save(save_dir)
Source code in vision_agent/sim/sim.py
def save(self, save_dir: Union[str, Path]) -> None:
    """Writes the dataframe and its embeddings into a directory.

    The directory (and any missing parents) is created if needed. The ``embs``
    column is stacked into one array and stored as ``embs.npy``; every other
    column is stored in ``df.csv`` without the index. ``self.df`` itself is
    left untouched because the work is done on a copy.

    Parameters:
        save_dir: Union[str, Path]: The directory to write into.
    """
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    frame = self.df.copy()
    emb_matrix = np.array(frame.embs.tolist())
    np.save(out_dir / "embs.npy", emb_matrix)
    frame.drop(columns="embs").to_csv(out_dir / "df.csv", index=False)

load staticmethod

load(
    load_dir, api_key=None, model="text-embedding-3-small"
)
Source code in vision_agent/sim/sim.py
@staticmethod
def load(
    load_dir: Union[str, Path],
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-small",
) -> "Sim":
    """Rebuilds a ``Sim`` from the ``df.csv`` and ``embs.npy`` files in a directory.

    Parameters:
        load_dir: Union[str, Path]: The directory holding ``df.csv`` and
            ``embs.npy``.
        api_key: Optional[str]: Forwarded to the ``Sim`` constructor.
        model: str: Forwarded to the ``Sim`` constructor.

    Returns:
        Sim: A ``Sim`` whose dataframe carries the stored embeddings in its
            ``embs`` column. Note that a plain ``Sim`` is built regardless of
            which class this method is called on.
    """
    src_dir = Path(load_dir)
    frame = pd.read_csv(src_dir / "df.csv")
    emb_matrix = np.load(src_dir / "embs.npy")
    # One array per dataframe row, taken along the first axis.
    frame["embs"] = [row for row in emb_matrix]
    return Sim(frame, api_key=api_key, model=model)

check_load staticmethod

check_load(load_dir, df)
Source code in vision_agent/sim/sim.py
@staticmethod
def check_load(
    load_dir: Union[str, Path],
    df: pd.DataFrame,
) -> bool:
    load_dir = Path(load_dir)
    if (
        not Path(load_dir / "df.csv").exists()
        or not Path(load_dir / "embs.npy").exists()
    ):
        return False

    df_load = pd.read_csv(load_dir / "df.csv")
    if platform.system() == "Windows":
        df_load = df_load.assign(
            doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
        )
    return df.equals(df_load)  # type: ignore

top_k cached

top_k(query, k=5, thresh=None)

Returns the top k most similar items to the query.

PARAMETER DESCRIPTION
query

str: The query to compare to.

TYPE: str

k

int: The number of items to return.

TYPE: int DEFAULT: 5

thresh

Optional[float]: The minimum similarity threshold.

TYPE: Optional[float] DEFAULT: None

RETURNS DESCRIPTION
Sequence[Dict]

Sequence[Dict]: The top k most similar items.

Source code in vision_agent/sim/sim.py
@lru_cache(maxsize=256)
def top_k(
    self,
    query: str,
    k: int = 5,
    thresh: Optional[float] = None,
) -> Sequence[Dict]:
    """Returns the top k most similar items to the query.

    Each row is scored as ``1 - cosine(row_embedding, query_embedding)``. The
    rows are sorted by that score in descending order and cut to the first
    ``k``; ``thresh`` is applied afterwards, so fewer than ``k`` items may be
    returned.

    Results are memoised by ``lru_cache`` on ``(self, query, k, thresh)``, so
    a repeated call returns the very same list object; callers should not
    mutate it. NOTE: the cache also keeps a reference to ``self``.

    Parameters:
        query: str: The query to compare to.
        k: int: The number of items to return.
        thresh: Optional[float]: The minimum similarity threshold; only rows
            scoring strictly above it are kept.

    Returns:
        Sequence[Dict]: The top k most similar items, one dict per row with
            every column except ``embs`` plus the ``sim`` score.
    """

    embedding = get_embedding(
        self.emb_call,
        query,
    )
    # Score on a derived frame instead of rebinding self.df: a persistent
    # `sim` column would leak into save()'s df.csv, break later check_load
    # comparisons, and go stale whenever the cache short-circuits a call.
    scored = self.df.assign(
        sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
    )
    res = scored.sort_values("sim", ascending=False).head(k)
    if thresh is not None:
        res = res[res.sim > thresh]
    return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")

vision_agent.sim.OllamaSim

OllamaSim(df, sim_key=None, model_name=None, base_url=None)

Bases: Sim

Source code in vision_agent/sim/sim.py
def __init__(
    self,
    df: pd.DataFrame,
    sim_key: Optional[str] = None,
    model_name: Optional[str] = None,
    base_url: Optional[str] = None,
) -> None:
    """Creates a similarity object whose embeddings come from an Ollama server.

    Parameters:
        df: pd.DataFrame: The dataframe to use for similarity.
        sim_key: Optional[str]: The column to embed into a new ``embs`` column.
            May only be omitted when ``df`` already has an ``embs`` column.
        model_name: Optional[str]: The embedding model to request. Defaults to
            ``"mxbai-embed-large"`` when None.
        base_url: Optional[str]: The embeddings endpoint to POST to. Defaults
            to ``"http://localhost:11434/api/embeddings"`` when None.

    Raises:
        ValueError: If ``df`` has no ``embs`` column and ``sim_key`` is None.
    """
    self.df = df
    endpoint = (
        "http://localhost:11434/api/embeddings" if base_url is None else base_url
    )
    ollama_model = "mxbai-embed-large" if model_name is None else model_name

    def emb_call(text: List[str]) -> List[float]:
        # Only the first entry of `text` is sent as the prompt.
        payload = {"prompt": text[0], "model": ollama_model}
        response = requests.post(endpoint, json=payload)
        return response.json()["embedding"]  # type: ignore

    self.emb_call = emb_call

    has_embs = "embs" in df.columns
    if not has_embs and sim_key is None:
        raise ValueError("key is required if no column 'embs' is present.")
    if sim_key is None:
        return

    # A given sim_key always (re)computes the embs column.
    embedded = self.df[sim_key].apply(
        lambda value: get_embedding(self.emb_call, value)
    )
    self.df = self.df.assign(embs=embedded)

client instance-attribute

client = OpenAI(api_key=api_key)

model instance-attribute

model = model

df instance-attribute

df = df

emb_call instance-attribute

emb_call = emb_call

save

save(save_dir)
Source code in vision_agent/sim/sim.py
def save(self, save_dir: Union[str, Path]) -> None:
    """Writes the dataframe and its embeddings into a directory.

    The directory (and any missing parents) is created if needed. The ``embs``
    column is stacked into one array and stored as ``embs.npy``; every other
    column is stored in ``df.csv`` without the index. ``self.df`` itself is
    left untouched because the work is done on a copy.

    Parameters:
        save_dir: Union[str, Path]: The directory to write into.
    """
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    frame = self.df.copy()
    emb_matrix = np.array(frame.embs.tolist())
    np.save(out_dir / "embs.npy", emb_matrix)
    frame.drop(columns="embs").to_csv(out_dir / "df.csv", index=False)

load staticmethod

load(
    load_dir, api_key=None, model="text-embedding-3-small"
)
Source code in vision_agent/sim/sim.py
@staticmethod
def load(
    load_dir: Union[str, Path],
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-small",
) -> "Sim":
    """Rebuilds a ``Sim`` from the ``df.csv`` and ``embs.npy`` files in a directory.

    Parameters:
        load_dir: Union[str, Path]: The directory holding ``df.csv`` and
            ``embs.npy``.
        api_key: Optional[str]: Forwarded to the ``Sim`` constructor.
        model: str: Forwarded to the ``Sim`` constructor.

    Returns:
        Sim: A ``Sim`` whose dataframe carries the stored embeddings in its
            ``embs`` column. Note that a plain ``Sim`` is built regardless of
            which class this method is called on.
    """
    src_dir = Path(load_dir)
    frame = pd.read_csv(src_dir / "df.csv")
    emb_matrix = np.load(src_dir / "embs.npy")
    # One array per dataframe row, taken along the first axis.
    frame["embs"] = [row for row in emb_matrix]
    return Sim(frame, api_key=api_key, model=model)

check_load staticmethod

check_load(load_dir, df)
Source code in vision_agent/sim/sim.py
@staticmethod
def check_load(
    load_dir: Union[str, Path],
    df: pd.DataFrame,
) -> bool:
    load_dir = Path(load_dir)
    if (
        not Path(load_dir / "df.csv").exists()
        or not Path(load_dir / "embs.npy").exists()
    ):
        return False

    df_load = pd.read_csv(load_dir / "df.csv")
    if platform.system() == "Windows":
        df_load = df_load.assign(
            doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
        )
    return df.equals(df_load)  # type: ignore

top_k cached

top_k(query, k=5, thresh=None)

Returns the top k most similar items to the query.

PARAMETER DESCRIPTION
query

str: The query to compare to.

TYPE: str

k

int: The number of items to return.

TYPE: int DEFAULT: 5

thresh

Optional[float]: The minimum similarity threshold.

TYPE: Optional[float] DEFAULT: None

RETURNS DESCRIPTION
Sequence[Dict]

Sequence[Dict]: The top k most similar items.

Source code in vision_agent/sim/sim.py
@lru_cache(maxsize=256)
def top_k(
    self,
    query: str,
    k: int = 5,
    thresh: Optional[float] = None,
) -> Sequence[Dict]:
    """Returns the top k most similar items to the query.

    Each row is scored as ``1 - cosine(row_embedding, query_embedding)``. The
    rows are sorted by that score in descending order and cut to the first
    ``k``; ``thresh`` is applied afterwards, so fewer than ``k`` items may be
    returned.

    Results are memoised by ``lru_cache`` on ``(self, query, k, thresh)``, so
    a repeated call returns the very same list object; callers should not
    mutate it. NOTE: the cache also keeps a reference to ``self``.

    Parameters:
        query: str: The query to compare to.
        k: int: The number of items to return.
        thresh: Optional[float]: The minimum similarity threshold; only rows
            scoring strictly above it are kept.

    Returns:
        Sequence[Dict]: The top k most similar items, one dict per row with
            every column except ``embs`` plus the ``sim`` score.
    """

    embedding = get_embedding(
        self.emb_call,
        query,
    )
    # Score on a derived frame instead of rebinding self.df: a persistent
    # `sim` column would leak into save()'s df.csv, break later check_load
    # comparisons, and go stale whenever the cache short-circuits a call.
    scored = self.df.assign(
        sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
    )
    res = scored.sort_values("sim", ascending=False).head(k)
    if thresh is not None:
        res = res[res.sim > thresh]
    return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")

vision_agent.sim.Sim

Sim(
    df,
    sim_key=None,
    api_key=None,
    model="text-embedding-3-small",
)

Creates a similarity object that can be used to find similar items in a dataframe.

PARAMETER DESCRIPTION
df

pd.DataFrame: The dataframe to use for similarity.

TYPE: DataFrame

sim_key

Optional[str]: The column name that you want to use to construct the embeddings.

TYPE: Optional[str] DEFAULT: None

api_key

Optional[str]: The OpenAI API key to use for embeddings.

TYPE: Optional[str] DEFAULT: None

model

str: The model to use for embeddings (see https://github.com/landing-ai/vision-agent/pull/280).

TYPE: str DEFAULT: 'text-embedding-3-small'

Source code in vision_agent/sim/sim.py
def __init__(
    self,
    df: pd.DataFrame,
    sim_key: Optional[str] = None,
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-small",
) -> None:
    """Creates a similarity object that can be used to find similar items in a
    dataframe.

    Parameters:
        df: pd.DataFrame: The dataframe to use for similarity.
        sim_key: Optional[str]: The column name that you want to use to construct
            the embeddings. May only be omitted when ``df`` already has an
            ``embs`` column.
        api_key: Optional[str]: The OpenAI API key to use for embeddings.
        model: str: The model to use for embeddings (see
            https://github.com/landing-ai/vision-agent/pull/280).

    Raises:
        ValueError: If ``df`` has no ``embs`` column and ``sim_key`` is None.
    """
    self.df = df
    self.client = OpenAI(api_key=api_key)

    def embed(x):
        # self.client is looked up on every call; only the first embedding of
        # the response is used.
        response = self.client.embeddings.create(input=x, model=model)
        return response.data[0].embedding

    self.emb_call = embed
    self.model = model

    has_embs = "embs" in df.columns
    if not has_embs and sim_key is None:
        raise ValueError("key is required if no column 'embs' is present.")
    if sim_key is None:
        return

    # A given sim_key always (re)computes the embs column.
    embedded = self.df[sim_key].apply(
        lambda value: get_embedding(self.emb_call, value)
    )
    self.df = self.df.assign(embs=embedded)

df instance-attribute

df = df

client instance-attribute

client = OpenAI(api_key=api_key)

emb_call instance-attribute

emb_call = lambda x: self.client.embeddings.create(input=x, model=model).data[0].embedding

model instance-attribute

model = model

save

save(save_dir)
Source code in vision_agent/sim/sim.py
def save(self, save_dir: Union[str, Path]) -> None:
    """Writes the dataframe and its embeddings into a directory.

    The directory (and any missing parents) is created if needed. The ``embs``
    column is stacked into one array and stored as ``embs.npy``; every other
    column is stored in ``df.csv`` without the index. ``self.df`` itself is
    left untouched because the work is done on a copy.

    Parameters:
        save_dir: Union[str, Path]: The directory to write into.
    """
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    frame = self.df.copy()
    emb_matrix = np.array(frame.embs.tolist())
    np.save(out_dir / "embs.npy", emb_matrix)
    frame.drop(columns="embs").to_csv(out_dir / "df.csv", index=False)

load staticmethod

load(
    load_dir, api_key=None, model="text-embedding-3-small"
)
Source code in vision_agent/sim/sim.py
@staticmethod
def load(
    load_dir: Union[str, Path],
    api_key: Optional[str] = None,
    model: str = "text-embedding-3-small",
) -> "Sim":
    """Rebuilds a ``Sim`` from the ``df.csv`` and ``embs.npy`` files in a directory.

    Parameters:
        load_dir: Union[str, Path]: The directory holding ``df.csv`` and
            ``embs.npy``.
        api_key: Optional[str]: Forwarded to the ``Sim`` constructor.
        model: str: Forwarded to the ``Sim`` constructor.

    Returns:
        Sim: A ``Sim`` whose dataframe carries the stored embeddings in its
            ``embs`` column.
    """
    src_dir = Path(load_dir)
    frame = pd.read_csv(src_dir / "df.csv")
    emb_matrix = np.load(src_dir / "embs.npy")
    # One array per dataframe row, taken along the first axis.
    frame["embs"] = [row for row in emb_matrix]
    return Sim(frame, api_key=api_key, model=model)

check_load staticmethod

check_load(load_dir, df)
Source code in vision_agent/sim/sim.py
@staticmethod
def check_load(
    load_dir: Union[str, Path],
    df: pd.DataFrame,
) -> bool:
    load_dir = Path(load_dir)
    if (
        not Path(load_dir / "df.csv").exists()
        or not Path(load_dir / "embs.npy").exists()
    ):
        return False

    df_load = pd.read_csv(load_dir / "df.csv")
    if platform.system() == "Windows":
        df_load = df_load.assign(
            doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
        )
    return df.equals(df_load)  # type: ignore

top_k cached

top_k(query, k=5, thresh=None)

Returns the top k most similar items to the query.

PARAMETER DESCRIPTION
query

str: The query to compare to.

TYPE: str

k

int: The number of items to return.

TYPE: int DEFAULT: 5

thresh

Optional[float]: The minimum similarity threshold.

TYPE: Optional[float] DEFAULT: None

RETURNS DESCRIPTION
Sequence[Dict]

Sequence[Dict]: The top k most similar items.

Source code in vision_agent/sim/sim.py
@lru_cache(maxsize=256)
def top_k(
    self,
    query: str,
    k: int = 5,
    thresh: Optional[float] = None,
) -> Sequence[Dict]:
    """Returns the top k most similar items to the query.

    Each row is scored as ``1 - cosine(row_embedding, query_embedding)``. The
    rows are sorted by that score in descending order and cut to the first
    ``k``; ``thresh`` is applied afterwards, so fewer than ``k`` items may be
    returned.

    Results are memoised by ``lru_cache`` on ``(self, query, k, thresh)``, so
    a repeated call returns the very same list object; callers should not
    mutate it. NOTE: the cache also keeps a reference to ``self``.

    Parameters:
        query: str: The query to compare to.
        k: int: The number of items to return.
        thresh: Optional[float]: The minimum similarity threshold; only rows
            scoring strictly above it are kept.

    Returns:
        Sequence[Dict]: The top k most similar items, one dict per row with
            every column except ``embs`` plus the ``sim`` score.
    """

    embedding = get_embedding(
        self.emb_call,
        query,
    )
    # Score on a derived frame instead of rebinding self.df: a persistent
    # `sim` column would leak into save()'s df.csv, break later check_load
    # comparisons, and go stale whenever the cache short-circuits a call.
    scored = self.df.assign(
        sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
    )
    res = scored.sort_values("sim", ascending=False).head(k)
    if thresh is not None:
        res = res[res.sim > thresh]
    return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")

vision_agent.sim.StellaSim

StellaSim(df, sim_key=None)

Bases: Sim

Source code in vision_agent/sim/sim.py
def __init__(
    self,
    df: pd.DataFrame,
    sim_key: Optional[str] = None,
) -> None:
    """Creates a similarity object whose embeddings come from ``stella_embeddings``.

    Parameters:
        df: pd.DataFrame: The dataframe to use for similarity.
        sim_key: Optional[str]: The column to embed into a new ``embs`` column.
            May only be omitted when ``df`` already has an ``embs`` column.

    Raises:
        ValueError: If ``df`` has no ``embs`` column and ``sim_key`` is None.
    """
    self.df = df

    def emb_call(text: List[str]) -> List[float]:
        # Only the first result of the batch call is used.
        batch = stella_embeddings(text)
        return batch[0]  # type: ignore

    self.emb_call = emb_call

    has_embs = "embs" in df.columns
    if not has_embs and sim_key is None:
        raise ValueError("key is required if no column 'embs' is present.")
    if sim_key is None:
        return

    # A given sim_key always (re)computes the embs column.
    embedded = self.df[sim_key].apply(
        lambda value: get_embedding(self.emb_call, value)
    )
    self.df = self.df.assign(embs=embedded)

client instance-attribute

client = OpenAI(api_key=api_key)

model instance-attribute

model = model

df instance-attribute

df = df

emb_call instance-attribute

emb_call = emb_call

save

save(save_dir)
Source code in vision_agent/sim/sim.py
def save(self, save_dir: Union[str, Path]) -> None:
    """Writes the dataframe and its embeddings into a directory.

    The directory (and any missing parents) is created if needed. The ``embs``
    column is stacked into one array and stored as ``embs.npy``; every other
    column is stored in ``df.csv`` without the index. ``self.df`` itself is
    left untouched because the work is done on a copy.

    Parameters:
        save_dir: Union[str, Path]: The directory to write into.
    """
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    frame = self.df.copy()
    emb_matrix = np.array(frame.embs.tolist())
    np.save(out_dir / "embs.npy", emb_matrix)
    frame.drop(columns="embs").to_csv(out_dir / "df.csv", index=False)

check_load staticmethod

check_load(load_dir, df)
Source code in vision_agent/sim/sim.py
@staticmethod
def check_load(
    load_dir: Union[str, Path],
    df: pd.DataFrame,
) -> bool:
    load_dir = Path(load_dir)
    if (
        not Path(load_dir / "df.csv").exists()
        or not Path(load_dir / "embs.npy").exists()
    ):
        return False

    df_load = pd.read_csv(load_dir / "df.csv")
    if platform.system() == "Windows":
        df_load = df_load.assign(
            doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
        )
    return df.equals(df_load)  # type: ignore

top_k cached

top_k(query, k=5, thresh=None)

Returns the top k most similar items to the query.

PARAMETER DESCRIPTION
query

str: The query to compare to.

TYPE: str

k

int: The number of items to return.

TYPE: int DEFAULT: 5

thresh

Optional[float]: The minimum similarity threshold.

TYPE: Optional[float] DEFAULT: None

RETURNS DESCRIPTION
Sequence[Dict]

Sequence[Dict]: The top k most similar items.

Source code in vision_agent/sim/sim.py
@lru_cache(maxsize=256)
def top_k(
    self,
    query: str,
    k: int = 5,
    thresh: Optional[float] = None,
) -> Sequence[Dict]:
    """Returns the top k most similar items to the query.

    Each row is scored as ``1 - cosine(row_embedding, query_embedding)``. The
    rows are sorted by that score in descending order and cut to the first
    ``k``; ``thresh`` is applied afterwards, so fewer than ``k`` items may be
    returned.

    Results are memoised by ``lru_cache`` on ``(self, query, k, thresh)``, so
    a repeated call returns the very same list object; callers should not
    mutate it. NOTE: the cache also keeps a reference to ``self``.

    Parameters:
        query: str: The query to compare to.
        k: int: The number of items to return.
        thresh: Optional[float]: The minimum similarity threshold; only rows
            scoring strictly above it are kept.

    Returns:
        Sequence[Dict]: The top k most similar items, one dict per row with
            every column except ``embs`` plus the ``sim`` score.
    """

    embedding = get_embedding(
        self.emb_call,
        query,
    )
    # Score on a derived frame instead of rebinding self.df: a persistent
    # `sim` column would leak into save()'s df.csv, break later check_load
    # comparisons, and go stale whenever the cache short-circuits a call.
    scored = self.df.assign(
        sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
    )
    res = scored.sort_values("sim", ascending=False).head(k)
    if thresh is not None:
        res = res[res.sim > thresh]
    return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")

load staticmethod

load(load_dir, api_key=None, model='stella1.5b')
Source code in vision_agent/sim/sim.py
@staticmethod
def load(
    load_dir: Union[str, Path],
    api_key: Optional[str] = None,
    model: str = "stella1.5b",
) -> "StellaSim":
    """Rebuilds a ``StellaSim`` from the ``df.csv`` and ``embs.npy`` files in a
    directory.

    Parameters:
        load_dir: Union[str, Path]: The directory holding ``df.csv`` and
            ``embs.npy``.
        api_key: Optional[str]: Accepted but not used by this method.
        model: str: Accepted but not used by this method.

    Returns:
        StellaSim: A ``StellaSim`` whose dataframe carries the stored
            embeddings in its ``embs`` column.
    """
    src_dir = Path(load_dir)
    frame = pd.read_csv(src_dir / "df.csv")
    emb_matrix = np.load(src_dir / "embs.npy")
    # One array per dataframe row, taken along the first axis.
    frame["embs"] = [row for row in emb_matrix]
    return StellaSim(frame)