Original code (decord):
import torch


def load_video_frames_from_video_file(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    compute_device=torch.device("cuda"),
):
    """Load the video frames from a video file."""
    import decord

    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
    # Get the original video height and width
    decord.bridge.set_bridge("torch")
    video_height, video_width, _ = decord.VideoReader(video_path).next().shape
    # Iterate over all frames in the video, resized to (image_size, image_size)
    images = []
    for frame in decord.VideoReader(video_path, width=image_size, height=image_size):
        images.append(frame.permute(2, 0, 1))
    images = torch.stack(images, dim=0).float() / 255.0
    if not offload_video_to_cpu:
        images = images.to(compute_device)
        img_mean = img_mean.to(compute_device)
        img_std = img_std.to(compute_device)
    # normalize by mean and std
    images -= img_mean
    images /= img_std
    return images, video_height, video_width
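A minimal usage sketch (not from the original post; the path and settings are placeholders): passing offload_video_to_cpu=True keeps the frame tensor on the CPU, which avoids filling GPU memory for long videos.
frames, h, w = load_video_frames_from_video_file(
    "example.mp4",               # hypothetical path, replace with a real video file
    image_size=1024,
    offload_video_to_cpu=True,   # keep frames on the CPU instead of compute_device
)
print(frames.shape, frames.device, h, w)  # e.g. torch.Size([N, 3, 1024, 1024]) cpu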
New code (OpenCV):
import cv2
import torch
import numpy as np


def load_video_frames_from_video_file(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    compute_device=torch.device("cuda"),
):
    """Use OpenCV to load video frames and return a normalized torch tensor."""
    # Convert to tensors of shape (3, 1, 1) so they broadcast over (3, H, W) frames
    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError(f"Cannot open video: {video_path}")
    video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    images = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Resize to (image_size, image_size)
        frame = cv2.resize(frame, (image_size, image_size))
        # BGR -> RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Convert to a torch tensor, shape (H, W, 3), scaled to [0, 1]
        frame_tensor = torch.from_numpy(frame).float() / 255.0
        # (H, W, 3) -> (3, H, W)
        frame_tensor = frame_tensor.permute(2, 0, 1)
        images.append(frame_tensor)
    cap.release()
    images = torch.stack(images, dim=0)  # (N, 3, H, W)
    if not offload_video_to_cpu:
        images = images.to(compute_device)
        img_mean = img_mean.to(compute_device)
        img_std = img_std.to(compute_device)
    # normalize by mean and std
    images = (images - img_mean) / img_std
    return images, video_height, video_width
decord and OpenCV produce different results:
import cv2
import torch
import numpy as np
import decord


def load_video_frames_from_video_file(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    compute_device=torch.device("cuda"),
):
    """Load the video frames from a video file."""
    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
    # Get the original video height and width
    decord.bridge.set_bridge("torch")
    video_height, video_width, _ = decord.VideoReader(video_path).next().shape
    # Iterate over all frames in the video, resized to (image_size, image_size)
    images = []
    for frame in decord.VideoReader(video_path, width=image_size, height=image_size):
        images.append(frame.permute(2, 0, 1))
    images = torch.stack(images, dim=0).float() / 255.0
    if not offload_video_to_cpu:
        images = images.to(compute_device)
        img_mean = img_mean.to(compute_device)
        img_std = img_std.to(compute_device)
    # normalize by mean and std
    images -= img_mean
    images /= img_std
    return images, video_height, video_width


def load_video_frames_from_video_file_cv(
    video_path,
    image_size,
    offload_video_to_cpu,
    img_mean=(0.485, 0.456, 0.406),
    img_std=(0.229, 0.224, 0.225),
    compute_device=torch.device("cuda"),
):
    """Use OpenCV to load video frames and return a normalized torch tensor."""
    # Convert to tensors of shape (3, 1, 1) so they broadcast over (3, H, W) frames
    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError(f"Cannot open video: {video_path}")
    video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    images = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # BGR -> RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Resize to (image_size, image_size)
        frame = cv2.resize(frame, (image_size, image_size))
        # Convert to a torch tensor, shape (H, W, 3), scaled to [0, 1]
        frame_tensor = torch.from_numpy(frame).float() / 255.0
        # (H, W, 3) -> (3, H, W)
        frame_tensor = frame_tensor.permute(2, 0, 1)
        images.append(frame_tensor)
    cap.release()
    images = torch.stack(images, dim=0)  # (N, 3, H, W)
    if not offload_video_to_cpu:
        images = images.to(compute_device)
        img_mean = img_mean.to(compute_device)
        img_std = img_std.to(compute_device)
    # normalize by mean and std
    images = (images - img_mean) / img_std
    return images, video_height, video_width
if __name__ == '__main__':
    path_mp4 = r"F:\data\0624_key\record(3)\record\0624_1429_0.mp4"
    image_size = 1024
    offload_video_to_cpu = False
    images, video_height, video_width = load_video_frames_from_video_file(path_mp4, image_size, offload_video_to_cpu)
    images1, video_height1, video_width1 = load_video_frames_from_video_file_cv(path_mp4, image_size, offload_video_to_cpu)
    print(torch.equal(images[0], images1[0]))
    diff = images[0] - images1[0]
    print(diff.max(), diff.min())
    # 1. Convert the (normalized) first frames back to uint8 images for visualization
    frame_decord = (images[0].clamp(0, 1).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
    frame_opencv = (images1[0].clamp(0, 1).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
    # 2. Compute the per-pixel absolute difference
    diff = cv2.absdiff(frame_decord, frame_opencv)
    # 3. Concatenate the images side by side for comparison
    vis = np.concatenate([frame_decord, frame_opencv, diff], axis=1)
    # 4. Show the result
    cv2.imshow("Decord | OpenCV | Diff", vis)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
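One plausible (unverified) source of the mismatch is the resize step rather than the normalization. A quick diagnostic sketch, reusing path_mp4, image_size and the decord result images from the script above, compares a few OpenCV interpolation modes against the first decord frame:
# Diagnostic sketch (assumption: the gap comes mainly from resizing/decoding differences).
# path_mp4, image_size and images are reused from the script above; mean/std are
# re-declared here so the check is self-contained.
mean = torch.tensor((0.485, 0.456, 0.406))[:, None, None]
std = torch.tensor((0.229, 0.224, 0.225))[:, None, None]
cap = cv2.VideoCapture(path_mp4)
ret, raw = cap.read()
cap.release()
if ret:
    raw = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
    ref = images[0].cpu()  # first decord frame, already normalized, shape (3, H, W)
    for name, interp in [("INTER_LINEAR", cv2.INTER_LINEAR),
                         ("INTER_AREA", cv2.INTER_AREA),
                         ("INTER_CUBIC", cv2.INTER_CUBIC)]:
        resized = cv2.resize(raw, (image_size, image_size), interpolation=interp)
        t = torch.from_numpy(resized).permute(2, 0, 1).float() / 255.0
        t = (t - mean) / std
        # mean absolute difference against the decord frame, per interpolation mode
        print(name, (t - ref).abs().mean().item())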