label-errors/examples/audioset_preprocessing.py at main · cleanlab/label-errors · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# coding: utf-8

# Copyright (c) 2021-2022 Cleanlab Inc.
# This file is part of cleanlab/label-errors.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab/label-errors is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with cleanlab/label-errors.  If not, see <https://www.gnu.org/licenses/>.

# This agreement applies to this version and all previous versions of
# cleanlab/label-errors.

"""
Preprocess the tfrecord AudioSet feature embeddings into numpy data files.

Resources used:
1. https://github.com/tensorflow/models/tree/master/research/audioset
2. https://research.google.com/audioset/download.html
3. https://github.com/audioset/ontology
"""


import argparse
import os
import numpy as np
import tensorflow as tf  # version 1.15.4
import multiprocessing
import tqdm
import pickle
from keras.preprocessing.sequence import pad_sequences

# Command-line interface. The single option is the directory that holds the
# AudioSet embedding splits; it is validated in the __main__ guard below.
parser = argparse.ArgumentParser(
    description='Preprocess the tfrecord AudioSet feature embeddings into '
                'numpy data files.',
)
parser.add_argument(
    '--audioset-dir', metavar='AUDIOSET_DIR',
    help='Specify path to ../audioset/audioset_v1_embeddings/',
)


def read_data(path, include_times=False):
    """Read one AudioSet tfrecord file into parallel per-example lists.

    Args:
        path: Path of a tfrecord file containing serialized examples.
        include_times: When True, also collect each example's
            'start_time_seconds' and 'end_time_seconds' values.

    Returns:
        A list ``[video_ids, labels, features]`` of equally long lists:
        ``video_ids[i]`` is a str, ``labels[i]`` is a list of ints and
        ``features[i]`` is a uint8 array with one row per audio frame.
        When ``include_times`` is True, two more lists follow: the start
        times and the end times, each entry a list of floats.
    """
    video_ids, labels, features = [], [], []
    result = [video_ids, labels, features]
    if include_times:
        start_times, end_times = [], []
        result += [start_times, end_times]
    for record in tf.python_io.tf_record_iterator(path):
        # Parse the record once. The video-level fields (id, labels, times)
        # are the SequenceExample's context features, so a second parse of
        # the same bytes as a plain Example is not needed.
        seq_example = tf.train.SequenceExample.FromString(record)
        context = seq_example.context.feature
        vid_id = context['video_id'].bytes_list.value[0].decode(
            encoding='UTF-8')
        if include_times:
            # Copy into plain lists, as is done for the labels, so the
            # result holds only built-in containers.
            start_times.append(
                list(context['start_time_seconds'].float_list.value))
            end_times.append(
                list(context['end_time_seconds'].float_list.value))
        frames = seq_example.feature_lists.feature_list[
            'audio_embedding'].feature
        # Each frame stores its embedding as raw bytes, one byte per value;
        # reinterpret them directly as uint8 instead of going through a hex
        # string.
        audio_frames = [
            np.frombuffer(frame.bytes_list.value[0], dtype=np.uint8)
            for frame in frames
        ]
        video_ids.append(vid_id)
        labels.append(list(context['labels'].int64_list.value))
        # np.stack copies the read-only frombuffer views into a new array.
        features.append(np.stack(audio_frames))
    return result


def pad(feature_matrix, maxlen=10):
    """Return ``feature_matrix`` padded or truncated to ``maxlen`` rows.

    The matrix is transposed so that ``pad_sequences`` sees each column as
    one sequence, every sequence is brought to length ``maxlen`` using the
    ``pad_sequences`` defaults, and the result is transposed back and cast
    to uint8.
    """
    per_column = feature_matrix.T
    fixed_length = pad_sequences(per_column, maxlen=maxlen)
    return fixed_length.T.astype(np.uint8)


def preprocess_data(path, prefix='bal_train'):
    """Convert every tfrecord file in ``path`` into three pickle files.

    All files in ``path`` are read in parallel with ``read_data``, their
    per-example results are concatenated, every feature matrix is passed
    through ``pad``, and the results are written to the current working
    directory as ``<prefix>_features.p``, ``<prefix>_video_ids.p`` and
    ``<prefix>_labels.p``.

    Args:
        path: Directory containing the tfrecord files (with or without a
            trailing path separator).
        prefix: Filename prefix for the three output pickle files.
    """
    # os.path.join works whether or not `path` ends with a separator, and
    # sorting makes the order of examples in the output reproducible
    # (os.listdir returns entries in arbitrary order).
    fns = [os.path.join(path, fn) for fn in sorted(os.listdir(path))]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        results = list(tqdm.tqdm(p.imap(read_data, fns), total=len(fns)))

    print('\nAll files read in. Now post-processing.')
    # Flatten the per-file results into one list per field.
    video_ids = [v for r in results for v in r[0]]
    labels = [l for r in results for l in r[1]]
    features = [f for r in results for f in r[2]]
    del results  # Free memory
    # Make all inputs exactly the same shape.
    print("Padding with 0 to make all features shape (10,128) of type uint8.")
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        features = list(tqdm.tqdm(p.imap(pad, features), total=len(features)))

    print('Saving pickled results.')
    outputs = (
        ('_features.p', features),
        ('_video_ids.p', video_ids),
        ('_labels.p', labels),
    )
    for suffix, payload in outputs:
        with open(prefix + suffix, 'wb') as wf:
            pickle.dump(payload, wf, pickle.HIGHEST_PROTOCOL)

    print('Preprocessing complete.')


def main(audioset_dir):
    """Preprocess the eval, bal_train and unbal_train splits in turn.

    Args:
        audioset_dir: Directory containing one sub-directory per split
            (with or without a trailing path separator).
    """
    for kind in ["eval", "bal_train", "unbal_train"]:
        # The trailing "" component makes the joined path end with a
        # separator, which is the form preprocess_data has always received.
        split_dir = os.path.join(audioset_dir, kind, "")
        preprocess_data(split_dir, prefix=kind)


if __name__ == '__main__':
    # Script entry point: require --audioset-dir, then process all splits.
    cli_args = parser.parse_args()
    audioset_dir = cli_args.audioset_dir
    if audioset_dir is None:
        usage_hint = (
            "Specify the path to the audioset embeddings "
            "directory.\nFor example, if the data is stored in "
            "'/datasets/audioset/audioset_v1_embeddings/' "
            "you should call this script like this:\npython "
            "audioset_preprocessing.py --audioset-dir "
            "'/datasets/audioset/audioset_v1_embeddings/'"
        )
        # parser.error prints the message and exits, so main is not reached.
        parser.error(usage_hint)
    main(audioset_dir=audioset_dir)