# OWLv2 Open-World Localization
This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.
"""Example: open-vocabulary object detection with the Owlv2 tool.

Loads an image, runs Owlv2 with a list of text prompts, and prints each
detection's label, confidence score, and bounding box.
"""
from PIL import Image  # needed for Image.open below

from vision_agent_tools.models.owlv2 import Owlv2

# (replace this path with your own!)
test_image = "path/to/your/image.jpg"
# What are you looking for? Write your detective prompts here!
prompts = ["a photo of a cat", "a photo of a dog"]
# Load the image and create your Owlv2 detective tool
image = Image.open(test_image)
owlv2 = Owlv2()
# Time to put Owlv2 to work! Let's see what it finds...
# NOTE(review): per the documented signature __call__(prompts, images=None, ...),
# prompts is the first parameter and images takes a list — the original call
# owlv2(image, prompts=prompts) would pass the image as the prompts argument.
results = owlv2(prompts=prompts, images=[image])[0]
# Did Owlv2 sniff out any objects? Let's see the results!
if results:
    for detection in results:
        print(f"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.")
        print(f"Here's where it's hiding: {detection['bbox']}")
else:
    print("Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?")
## Owlv2
Bases: BaseMLModel
Tool for object detection using the pre-trained Owlv2 model from Transformers.
This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.
### __call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)
Performs object detection on images using the Owlv2 model.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
prompts |
list[str]
|
The prompt to be used for object detection. |
required |
images |
list[Image] | None
|
The images to be analyzed. |
None
|
video |
VideoNumpy[uint8] | None
|
A numpy array containing the different images, representing the video. |
None
|
batch_size |
int
|
The batch size used for processing multiple images or video frames. |
1
|
nms_threshold |
float
|
The IoU threshold value used to apply a class-agnostic Non-Maximum Suppression (NMS). |
0.3
|
confidence |
float
|
Confidence threshold for model predictions. |
0.1
|
Returns:
Type | Description |
---|---|
list[ODWithScoreResponse]
|
list[ODWithScoreResponse]:
A list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. |
### __init__(model_config=OWLV2Config())
Loads the pre-trained Owlv2 processor and model from Transformers.
## Owlv2ProcessorWithNMS
Bases: Owlv2Processor
### post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)
Converts the raw output of `OwlViTForObjectDetection` into final
bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
outputs |
OwlViTObjectDetectionOutput
|
Raw outputs of the model. |
required |
threshold |
float
|
Score threshold to keep object detection predictions. |
0.1
|
nms_threshold |
float
|
IoU threshold used to filter overlapping objects from the raw detections. |
0.3
|
target_sizes |
TensorType | list[Tuple] | None
|
Tensor of shape (batch_size, 2) containing the target size (height, width) of each image in the batch; if left to None, predictions will not be resized. |
None
|
Returns:
list[dict]
:
A list of dictionaries, each dictionary containing the scores, labels
and boxes for an image in the batch as predicted by the model.