Florence2Sam2
This tool uses Florence2 and the SAM-2 model to do text to instance segmentation on image or video inputs.
import cv2

from vision_agent_tools.models.florence2_sam2 import Florence2SAM2

# Path to your video
video_path = "path/to/your/video.mp4"

# Load the video into a list of frames (BGR numpy arrays, one per frame).
cap = cv2.VideoCapture(video_path)
frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        # No more frames (or a read error) — stop collecting.
        break
    frames.append(frame)
cap.release()

# Create the Florence2SAM2 instance
florence2_sam2 = Florence2SAM2()

# Segment all instances of the prompt "ball" across all video frames.
results = florence2_sam2(prompt="ball", video=frames)

# `results` is a list of lists: the outer list has one entry per frame and
# each inner list contains all predictions for that frame. The annotation
# "id" can be used to track the same object across different frames.
# For example:
#
# [
#     [
#         {
#             "id": 0,
#             "mask": rle,
#             "label": "ball",
#             "bbox": [x_min, y_min, x_max, y_max],
#         }
#     ],
#     [
#         {
#             "id": 0,
#             "mask": rle,
#             "label": "ball",
#             "bbox": [x_min, y_min, x_max, y_max],
#         }
#     ],
# ]
print("Instance segmentation complete!")
You can also run similarity against an image and additionally get bounding boxes by doing the following:
Florence2SAM2
#
Bases: BaseMLModel
A class that receives a video or images, a text prompt and returns the instance segmentation based on the input for each frame.
__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)
#
Florence2Sam2 model find objects in images and track objects in a video.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `prompt` | `str` | The text input that complements the media to find or track objects. | required |
| `images` | `list[Image] \| None` | The images to be analyzed. | `None` |
| `video` | `VideoNumpy \| None` | A numpy array containing the different images, representing the video. | `None` |
| `chunk_length_frames` | `int \| None` | The number of frames for each chunk of video to analyze. The last chunk may have fewer frames. | `20` |
| `iou_threshold` | `float` | The IoU threshold value used to compare `last_predictions` and `new_predictions` objects. | `0.6` |
| `nms_threshold` | `float` | The non-maximum suppression threshold value used to filter the Florence2 predictions. | `0.3` |

Returns:

| Type | Description |
|---|---|
| `list[list[dict[str, Any]]]` | A list where each item represents each frame's predictions, e.g. `[[{"id": 0, "mask": rle, "label": "car", "bbox": [0.1, 0.2, 0.3, 0.4]}]]` |
__init__(model_config=Florence2SAM2Config())
#
Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.
fine_tune(checkpoint)
#
Load the fine-tuned Florence-2 model.
load_base()
#
Load the base Florence-2 model.