# CLIPMediaSim

## Video similarity
```python
import cv2
import numpy as np
from PIL import Image

from vision_agent_tools.models.clip_media_sim import CLIPMediaSim

# Path to your target image
image_path = "path/to/your/image.jpg"

# Path to your video
video_path = "path/to/your/video.mp4"

# Load the image
target_image = Image.open(image_path)

# Load the video into frames
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)

frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()

# Stack the frames into a single ndarray, the input format the model expects
frames = np.stack(frames, axis=0)

# Compute the total video duration in seconds
video_time = len(frames) / fps

# Create the CLIPMediaSim instance
clip_media_sim = CLIPMediaSim()

# Run video similarity against the target image
results = clip_media_sim(video=frames, target_image=target_image)

# results is a list of [frame_index, similarity_score] pairs for the frames
# that are most similar to the target image.

# To find the time at which a given frame occurs, convert its index to seconds
time_per_frame = video_time / len(frames)
timestamp = results[0][0] * time_per_frame

print("Similarity detection complete!")
```
You can also run similarity against a target text by passing `target_text` instead of `target_image`:
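A minimal sketch of that call, reusing `clip_media_sim` and the `frames` array from the example above (the prompt string is only an illustration):

```python
# Run video similarity against a target text prompt instead of an image
results = clip_media_sim(video=frames, target_text="a person riding a bicycle")

# As before, results is a list of [frame_index, similarity_score] pairs
```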
## CLIPMediaSim

Bases: `BaseMLModel`
A class that receives a video and a target image or text and returns the frames that are most similar to the target.
### `__call__(video, target_image=None, target_text=None, thresh=0.3)`
Receives a video and a target image or text and returns the frames that are most similar to the target.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `video` | `ndarray` | The input video to be processed. | *required* |
| `target_image` | `Image \| None` | The target image to compare the video frames with. | `None` |
| `target_text` | `str \| None` | The target text to compare the video frames with. | `None` |
| `thresh` | `float` | The threshold to filter the results. Defaults to 0.3. | `0.3` |
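For example, raising `thresh` keeps only the strongest matches. A sketch reusing the objects from the example above; the `0.8` cutoff is an arbitrary illustration:

```python
# Keep only frames whose similarity to the target exceeds 0.8
strict_results = clip_media_sim(video=frames, target_image=target_image, thresh=0.8)
```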
### `__init__(device='cuda')`
Initializes the CLIPMediaSim object with a pre-trained CLIP model.
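For example, to run on the CPU instead of the default CUDA device (a sketch; this assumes a CPU-capable environment):

```python
# Instantiate the model on the CPU instead of the default "cuda" device
clip_media_sim_cpu = CLIPMediaSim(device="cpu")
```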