AdaMML Setup and Usage

AdaMML is a multimodal video-classification model. If that is something you need as well, follow along!

1. Download

The AdaMML source code is available on GitHub, so downloading it is straightforward:

git clone https://github.com/IBM/AdaMML --depth=1

Next, install the dependencies (opencv-python and sk-video are added here because my optical-flow script below needs them):

pip3 install torch torchvision librosa tqdm Pillow numpy opencv-python sk-video

Download the pretrained weights for the three modalities from the project page, then put them in the pretrain folder.

Download links: RGB · Audio · Flow
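After downloading, pretrain/ should contain the three checkpoints under the names used by the training commands later in this post:

pretrain/
├── kinetics-sounds-rgb-resnet-50.pth.tar
├── kinetics-sounds-audio-mobilenet_v2.pth.tar
└── kinetics-sounds-flow-resnet-50.pth.tar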

2. Dataset Preparation

First, create a data folder in the repository root and put the videos under data/videos/. Then extract the RGB frames and audio (the resulting layout is sketched after the commands):

# Extract RGB frames
python tools/extract_rgb.py data/videos/ data/rgbs/
# Extract audio
python tools/extract_audio.py data/videos/ data/audios/
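Once everything has been extracted (including the optical-flow step below), data/ should look roughly like this; the 00001.jpg naming matches the image_tmpl configured later:

data/
├── videos/   # original videos (e.g. V_*.mp4, NV_*.mp4)
├── rgbs/     # one folder per video with frames 00001.jpg, 00002.jpg, ...
├── audios/   # one .wav per video
└── flows/    # one folder per video with x_00001.jpg / y_00001.jpg pairs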

AdaMML itself uses TSN tooling to generate optical flow, or approximates it with RGB differences. I wrote my own script that computes optical flow with the Farneback algorithm instead; feel free to copy it directly:

#!/usr/bin/env python3

import argparse
import concurrent.futures
import glob
import os

import cv2
import numpy as np
import skvideo.io
from tqdm import tqdm


def optical_flow(file_path, targetdir, short_side):
    """Write per-frame Farneback flow as x_/y_ JPEG pairs into targetdir."""
    cap = cv2.VideoCapture(file_path)
    ret, frame1 = cap.read()
    if not ret:
        cap.release()
        return

    prvs = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    prvs = cv2.resize(prvs, (short_side, short_side))

    frame_count = 1
    while True:
        ret, frame2 = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
        gray = cv2.resize(gray, (short_side, short_side))

        # Dense Farneback flow between two consecutive frames
        flow = cv2.calcOpticalFlowFarneback(prvs, gray, None,
                                            0.5, 3, 15, 3, 5, 1.2, 0)
        # Rescale each component to [0, 255] so it can be stored as JPEG
        flow[..., 0] = cv2.normalize(flow[..., 0], None, 0, 255, cv2.NORM_MINMAX)
        flow[..., 1] = cv2.normalize(flow[..., 1], None, 0, 255, cv2.NORM_MINMAX)

        cv2.imwrite(f'{targetdir}/x_{frame_count:05d}.jpg',
                    flow[..., 0].astype(np.uint8))
        cv2.imwrite(f'{targetdir}/y_{frame_count:05d}.jpg',
                    flow[..., 1].astype(np.uint8))
        prvs = gray
        frame_count += 1
    cap.release()


def video_to_flow(video, targetdir, short_side=256):
    output_foldername = os.path.join(targetdir,
                                     os.path.basename(video).split(".")[0])

    if not os.path.exists(video):
        print(f"{video} does not exist.")
        return video, False

    # Sanity check: make sure ffprobe can read the video before decoding it
    try:
        video_meta = skvideo.io.ffprobe(video)
        int(video_meta['video']['@height'])
        int(video_meta['video']['@width'])
    except Exception as e:
        print(f"Cannot get video info: {video}, error {e}")
        return video, False

    os.makedirs(output_foldername, exist_ok=True)
    optical_flow(video, output_foldername, short_side)
    return video, True


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('videos_dir', help='Input directory of videos with audio')
    parser.add_argument('output_dir', help='Output directory to store JPEG files')
    parser.add_argument('--num_workers', help='Number of workers', default=8, type=int)
    args = parser.parse_args()

    video_list = glob.glob(args.videos_dir + '/**/*.*', recursive=True)
    with concurrent.futures.ProcessPoolExecutor(max_workers=args.num_workers) as executor:
        futures = [executor.submit(video_to_flow, video, args.output_dir, 256)
                   for video in video_list]
        with tqdm(total=len(futures)) as t_bar:
            for future in concurrent.futures.as_completed(futures):
                video_id, success = future.result()
                if not success:
                    print(f"Something wrong with {video_id}")
                t_bar.update()
    print("Completed")

Run the command below to generate the optical-flow images. This is fairly slow, so sit back and have a cup of tea while you wait.

python tools/extract_flow.py data/videos/ data/flows/
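Before moving on, an optional sanity check (my own addition, not part of AdaMML): assuming tools/extract_rgb.py keeps every decoded frame, a video with N RGB frames should yield 2 × (N − 1) flow images (one x and one y per consecutive frame pair). A minimal sketch:

import os

def check_flow(rgb_root="data/rgbs", flow_root="data/flows"):
    # Compare per-video counts; at this stage both directories are still flat.
    for name in sorted(os.listdir(flow_root)):
        rgb_dir = os.path.join(rgb_root, name)
        flow_dir = os.path.join(flow_root, name)
        if not (os.path.isdir(rgb_dir) and os.path.isdir(flow_dir)):
            continue
        n_rgb = len(os.listdir(rgb_dir))
        n_flow = len(os.listdir(flow_dir))
        if n_flow != 2 * (n_rgb - 1):
            print(f"{name}: {n_rgb} RGB frames vs {n_flow} flow images")

check_flow()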

Normalize the dataset: move samples of the same class into one folder, and create empty WAV files for videos that produced no audio:

import os

train_scale = 0.7
modalities = ['rgbs', 'flows', 'audios']
data_classes = ["NonViolence", "Violence"]
pattern = ["NV_*", "V_*"]


def move_by_class(modality):
    """Move each sample into a subfolder named after its class."""
    data_path = f"data/{modality}/"
    for idx, class_ in enumerate(data_classes):
        dest = data_path + class_
        if not os.path.exists(dest):
            os.mkdir(dest)
        os.system(f"mv {data_path}{pattern[idx]} {dest}")


def touch():
    """Ensure every RGB clip has a matching .wav (creates empty placeholders)."""
    rgbs_data_path = "data/rgbs/"
    audios_data_path = "data/audios/"
    for class_ in data_classes:
        dest = os.path.join(rgbs_data_path, class_)
        for files in os.listdir(dest):
            touch_file = os.path.join(audios_data_path, class_, files)
            os.system(f"touch {touch_file}.wav")


for modality in modalities:
    move_by_class(modality)
print("Move Successful")

touch()
print("Touch Successful")

Generate the annotation files, then delete the empty WAV files:

import os

train_scale = 0.7
modalities = ['rgbs', 'flows', 'audios']
data_classes = ["NonViolence", "Violence"]


def get_annotation(modality):
    """Write train.txt / val.txt in the `path;start;num_frames;label` format."""
    data_path = f"data/{modality}/"
    train_file_path_io = open(os.path.join(data_path, "train.txt"), "w")
    val_file_path_io = open(os.path.join(data_path, "val.txt"), "w")

    # Iterate over each class
    for one_class in data_classes:
        cur_path = os.path.join(data_path, one_class)
        files = os.listdir(cur_path)

        i = 0
        count = len(files) * train_scale  # the first 70% go to the training list

        # Sort numerically by the index in the file name (e.g. NV_12 -> 12)
        if modality != "audios":
            files = sorted(files, key=lambda x: int(x.split("_")[1]))
        else:
            files = sorted(files, key=lambda x: int(x.split('.')[0].split("_")[1]))

        # For rgbs/flows, count the images in the sample's own folder;
        # for audios, count the frames in the matching RGB folder instead
        for f in files:
            cur_path_detail = os.path.join(cur_path, f) if modality != "audios" else \
                os.path.join(cur_path, f).replace("audios", "rgbs").split(".")[0]

            size = len(os.listdir(cur_path_detail))
            write_path = os.path.join(one_class, f)
            label = 0 if one_class == "Violence" else 1

            if i < count:
                train_file_path_io.write(f"{write_path};1;{size};{label}\n")
            else:
                val_file_path_io.write(f"{write_path};1;{size};{label}\n")
            i += 1

    train_file_path_io.close()
    val_file_path_io.close()


for modality in modalities:
    get_annotation(modality)
print("Get Annotation Successful")


def delete():
    """Remove the zero-byte .wav placeholders created earlier."""
    audios_data_path = "data/audios/"
    for class_ in data_classes:
        dest = os.path.join(audios_data_path, class_)
        os.system(f"find {dest}/* -type f -size 0c | xargs -n 1 rm -f")


delete()
print("Delete Successful")

Configure datasets_config.py by adding an entry for our dataset:

    'two-class': {
        'num_classes': 2,
        'train_list_name': 'train.txt',
        'val_list_name': 'val.txt',
        'filename_seperator': ";",
        'image_tmpl': '{:05d}.jpg',
        'filter_video': 0,
        'label_file': 'categories.txt'
    }
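Note that this entry references a label_file named categories.txt, which none of the steps above created. A minimal sketch for generating it, assuming AdaMML expects one class name per line ordered by label index and that the file lives next to train.txt in each datadir (both assumptions, so adjust if loading fails):

# Hypothetical helper: write categories.txt (one class per line, label order).
# Label order matches the annotation script: 0 = Violence, 1 = NonViolence.
classes_by_label = ["Violence", "NonViolence"]

for modality in ["rgbs", "flows", "audios"]:
    with open(f"data/{modality}/categories.txt", "w") as f:
        f.write("\n".join(classes_by_label) + "\n")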

Patch the source

# Line 55 needs to be replaced with the following
correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
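For context: this line lives in the standard top-k accuracy helper popularized by the PyTorch ImageNet example, which AdaMML appears to reuse. In recent PyTorch versions the sliced tensor correct[:k] is no longer contiguous, so the original .view(-1) raises a RuntimeError; adding .contiguous() (or using .reshape(-1)) fixes it. A sketch of the full fixed function (the standard pattern, not copied verbatim from AdaMML):

import torch

def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent for a batch of logits."""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # The fix: slicing makes `correct` non-contiguous, so call
        # .contiguous() before .view(-1)
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res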

3. Model Training

Next, let's train our own model. If you only want to use the RGB modality, run:

python3 train.py --gpu_id 0 --backbone_net adamml -d 50 \
--groups 8 --frames_per_group 4 -b 2 -j 3 --epochs 20 --warmup_epochs 5 --finetune_epochs 10 \
--modality rgb --datadir data/rgbs/ --dataset two-class --logdir log \
--dense_sampling --fusion_point logits --unimodality_pretrained pretrain/kinetics-sounds-rgb-resnet-50.pth.tar \
--learnable_lf_weights --num_segments 5 --cost_weights 1.0 0.005 --causality_modeling lstm --gammas 10.0 --sync-bn \
--lr 0.001 --p_lr 0.01 --lr_scheduler multisteps --lr_steps 10 15

The most important arguments:

  • --gpu_id 0: use GPU 0
  • --multiprocessing-distributed: multi-GPU training
  • --modality rgb: use the RGB modality
  • --datadir data/rgbs/: path to the RGB dataset
  • --dataset: name of the dataset entry configured in datasets_config.py above
  • --unimodality_pretrained: pretrained weights for the single modality
  • --warmup_epochs: number of warm-up epochs
  • --finetune_epochs: number of epochs for training the policy network
  • --epochs: total number of training epochs

Training with multiple modalities:

python3 train.py --gpu_id 0 --backbone_net adamml -d 50 \
--groups 8 --frames_per_group 4 -b 1 -j 3 --epochs 20 --warmup_epochs 5 --finetune_epochs 10 \
--modality rgb sound flow --datadir data/rgbs/ data/audios/ data/flows/ --dataset two-class --logdir log \
--dense_sampling --fusion_point logits --unimodality_pretrained  pretrain/kinetics-sounds-rgb-resnet-50.pth.tar pretrain/kinetics-sounds-audio-mobilenet_v2.pth.tar pretrain/kinetics-sounds-flow-resnet-50.pth.tar \
--learnable_lf_weights --num_segments 5 --cost_weights 0.5 0.05 0.8 --causality_modeling lstm --gammas 10.0 --sync-bn \
--lr 0.001 --p_lr 0.01 --lr_scheduler multisteps --lr_steps 10 15

Evaluating the model:

python3 train.py -e --gpu_id 0 --backbone_net adamml -d 50 \
--groups 8 --frames_per_group 4 -b 2 -j 3 \
--modality rgb --datadir data/rgbs --dataset two-class --logdir log \
--dense_sampling --fusion_point logits --pretrained log/last/checkpoint_warmup_01.pth.tar \
--learnable_lf_weights --num_segments 5 --causality_modeling lstm --sync-bn
Updated 2022-03-18