【Python】動画内の指定オブジェクトを追跡して自動トリミング（3）

2024年4月5日 12:21

【状況】特定した動画の領域を切り抜いて動画にしたい
【対処】ffmpegを使って切り抜いた動画を作成

前回，指定したオブジェクトが動く範囲の領域を求めたので，その領域だけを対象にmp4エンコードします．以前に作ったトリミング方法を利用します．

画像サイズの補正

mp4にエンコードする際，画像サイズの縦横が16の倍数であることが望ましいので，値の補正を行います．ここでは，簡単にするため，領域右下の位置を縮める方法にします．幅と高さについて，//を用いて整数除算（商が整数のみとなる）し，16倍すれば，16の倍数となりますね．

# Size of the tracked region; both edges are inclusive, hence the +1
width, height = max_x - min_x + 1, max_y - min_y + 1

# Shrink each dimension to a multiple of 16:
# integer-divide by 16 (drops the remainder), then scale back up
new_width = 16 * (width // 16)
new_height = 16 * (height // 16)

# Pull the bottom-right corner inwards so the region has the adjusted size
max_x_new = (min_x - 1) + new_width
max_y_new = (min_y - 1) + new_height

切り抜いてmp4エンコード

以前に作った切り抜き関数crop_and_encode，get_bitrateがあるので，それを使います．ビットレートはとりあえず半分にしてあります（計算をサボった）．
（以前にmoviepyを使って切り抜くプログラムを作っていたのでそれを使って結果を作っていたところ，ライブラリの不具合が判明し，ffmpegに乗り換えるというバタバタがあったので，若干？なタイトルになっています）

# Bitrate of the source video
org_bitrate = get_bitrate(video_path)
# Bitrate for the cropped video: simply half of the source for now
new_bitrate = org_bitrate / 2
# Crop the region (min_x, min_y)-(max_x_new, max_y_new) and encode it to out.mp4
crop_and_encode(
    input_file=video_path,
    x1=min_x,
    y1=min_y,
    x2=max_x_new,
    y2=max_y_new,
    output_file="out.mp4",
    bitrate=new_bitrate,
)

切り抜き結果

トラッキング対象のid1, id2は以下の通りです．

切り抜き画像を見ると，元の領域（白線）より，エンコードの領域（赤線）が少し内側にあることが分かります．変換前後の画像サイズは，
id1: (353, 320)　→　(352, 320)
id2: (403, 206)　→　(400, 256)（※縮める方向の補正では高さが増えることはないため，変換前の高さ「206」は誤記の可能性があります．要確認）
となっていて，16の倍数になっていることが確認できました（id1は，画像ではほとんど境界線の違いが分からない）．

また，切り抜いた動画の最初と最後のフレームを確認します．id1，id2ともに，端から端まで動いている様子を捉えていることが分かります．

これで，ひとまず目的は達成しました♪

コード

from ultralytics import YOLO

import cv2
import ffmpeg

# Get the bitrate of the source video
def get_bitrate(file_path):
    """Return the video bitrate of *file_path* in bits per second.

    The file is inspected with ``ffmpeg.probe``. The ``bit_rate`` entry of the
    first video stream is preferred. When that stream has no ``bit_rate``
    entry, the container-level ``format.bit_rate`` is used as a fallback
    (NOTE(review): that value presumably covers all streams, audio included,
    so it is only an approximation of the video bitrate).

    Args:
        file_path: Path of the video file to inspect.

    Returns:
        The bitrate as an ``int``.

    Raises:
        ValueError: If the file has no video stream, or if neither the video
            stream nor the container reports a bitrate.
    """
    # Probe the file for stream/container information
    video_info = ffmpeg.probe(file_path)
    video_stream = next(
        (
            stream
            for stream in video_info.get('streams', [])
            if stream.get('codec_type') == 'video'
        ),
        None,
    )
    if video_stream is None:
        # Without this check the lookup below fails with an opaque TypeError.
        raise ValueError(f"no video stream found in {file_path!r}")

    # Prefer the per-stream value; fall back to the container-level one.
    bitrate = video_stream.get('bit_rate')
    if bitrate is None:
        bitrate = video_info.get('format', {}).get('bit_rate')
    if bitrate is None:
        raise ValueError(f"no bitrate information available for {file_path!r}")

    return int(bitrate)

# Crop a rectangle out of the source video and encode it at the given bitrate
def crop_and_encode(input_file, x1, y1, x2, y2, output_file, bitrate):
    """Crop the rectangle (x1, y1)-(x2, y2) from a video and encode it.

    Both corners are inclusive pixel coordinates, so the cropped size is
    ``(x2 - x1 + 1, y2 - y1 + 1)``. The audio of the input is passed to the
    output as well when the input has an audio stream; otherwise only the
    cropped video is written. An existing output file is overwritten.

    Args:
        input_file: Path of the source video.
        x1: Left edge of the crop region.
        y1: Top edge of the crop region.
        x2: Right edge of the crop region (inclusive).
        y2: Bottom edge of the crop region (inclusive).
        output_file: Path of the video file to create.
        bitrate: Video bitrate handed to ffmpeg as ``video_bitrate``.

    Raises:
        ValueError: If the region has a non-positive width or height.
    """
    # Width and height of the inclusive region
    width = x2 - x1 + 1
    height = y2 - y1 + 1
    if width <= 0 or height <= 0:
        raise ValueError(
            f"invalid crop region: ({x1}, {y1})-({x2}, {y2}) gives size {width}x{height}"
        )

    # A float such as org_bitrate / 2 is truncated to whole bits per second;
    # other values (e.g. strings like "1M") are forwarded untouched.
    if isinstance(bitrate, float):
        bitrate = int(bitrate)

    source = ffmpeg.input(input_file)                    # input file
    video = ffmpeg.crop(source, x1, y1, width, height)   # crop by top-left corner plus size

    # Only map the audio when the input really has an audio stream;
    # requesting a missing stream makes the ffmpeg run fail.
    has_audio = any(
        stream.get('codec_type') == 'audio'
        for stream in ffmpeg.probe(input_file).get('streams', [])
    )
    streams = [source.audio, video] if has_audio else [video]

    out = ffmpeg.output(*streams, output_file, video_bitrate=bitrate)
    ffmpeg.run(out, overwrite_output=True)  # run the encode, overwriting any existing file

# Load the YOLOv8 model
model = YOLO('yolov8n.pt')

# Open the video file
video_path = "walk.mp4"
cap = cv2.VideoCapture(video_path)

success, frame = cap.read()
if not success:
    # Nothing could be read: release the capture before stopping.
    cap.release()
    raise SystemExit(f"動画を読み込めませんでした: {video_path}")

# Track the first frame so that the user can choose the target object
results = model.track(frame, persist=True)

cv2.imshow("YOLOv8 Tracking", results[0].plot())
cv2.waitKey(1)  # the window is not refreshed without a wait

print("処理対象の[番号]を入力：")
select_id = int(input())  # tracking id typed by the user

# Copy of the first frame; every tracked box is drawn on top of it
tracking_frame = frame.copy()

# Initial extent of the tracked region: minima start at the largest possible
# value and maxima at the smallest, so the first detection replaces them.
frame_width = frame.shape[1]
frame_height = frame.shape[0]
min_x, min_y, max_x, max_y = frame_width, frame_height, 0, 0

# Loop through the video frames.  The result of the first frame (computed
# above) is processed too, so the region also covers the object's start position.
while True:
    boxes = results[0].boxes

    # boxes.id is None when nothing is tracked in this frame; skip such frames
    # instead of failing in zip().
    if boxes.id is not None:
        # boxes.xyxy holds the corner coordinates, boxes.id the tracking ids
        for box, track_id in zip(boxes.xyxy, boxes.id):
            if int(track_id) == select_id:  # the object chosen by the user
                box_x1, box_y1, box_x2, box_y2 = map(int, box)

                # Draw the bounding box in blue
                cv2.rectangle(tracking_frame, (box_x1, box_y1), (box_x2, box_y2), (255, 0, 0), 2)

                # Grow the overall region to include this box
                min_x = min(box_x1, min_x)
                min_y = min(box_y1, min_y)
                max_x = max(box_x2, max_x)
                max_y = max(box_y2, max_y)
                break

    # Show the tracking progress
    cv2.imshow("YOLOv8 Tracking", tracking_frame)
    cv2.waitKey(1)  # the window is not refreshed without a wait

    # Read the next frame; stop at the end of the video
    success, frame = cap.read()
    if not success:
        break

    # Run YOLOv8 tracking on the frame, persisting tracks between frames
    results = model.track(frame, persist=True)

# The capture is no longer needed once all frames have been read
cap.release()

# The selected id never appeared: there is no region to crop.
if min_x > max_x or min_y > max_y:
    cv2.destroyAllWindows()
    raise SystemExit(f"指定した番号 {select_id} のオブジェクトは検出されませんでした")

# Keep the inclusive pixel coordinates inside the frame (a box edge may be
# reported at the frame width/height itself).
min_x = max(min_x, 0)
min_y = max(min_y, 0)
max_x = min(max_x, frame_width - 1)
max_y = min(max_y, frame_height - 1)

# White rectangle enclosing every tracked box
cv2.rectangle(tracking_frame, (min_x, min_y), (max_x, max_y), (255, 255, 255), 2)

# Size of the region (both edges inclusive)
width = max_x - min_x + 1
height = max_y - min_y + 1

# Shrink width and height to multiples of 16 (floor division, then times 16)
new_width = width // 16 * 16
new_height = height // 16 * 16

# A region smaller than 16 pixels in either direction would shrink to nothing.
if new_width == 0 or new_height == 0:
    cv2.destroyAllWindows()
    raise SystemExit(f"切り抜き領域が小さすぎます（幅: {width}，高さ: {height}）")

# Move the bottom-right corner inwards to match the adjusted size
max_x_new = min_x + new_width - 1
max_y_new = min_y + new_height - 1

# Red rectangle showing the region that is actually encoded
cv2.rectangle(tracking_frame, (min_x, min_y), (max_x_new, max_y_new), (0, 0, 255), 2)

# Report the size before and after the adjustment
print("変換前の幅:", width, "，高さ:", height)
print("変換後の幅:", new_width, "，高さ:", new_height)

# Save the tracking result as an image
cv2.imwrite("clip_box.jpg", tracking_frame)

org_bitrate = get_bitrate(video_path)   # bitrate of the source video
new_bitrate = org_bitrate // 2          # bitrate of the cropped video (simply half, kept integral)
crop_and_encode(video_path, min_x, min_y, max_x_new, max_y_new, "out.mp4", new_bitrate)

# Close the display window
cv2.destroyAllWindows()

この記事が気に入ったらサポートをしてみませんか？