# ------------------------------------------------------------------------------
# Adapted from https://github.com/activitynet/ActivityNet/
# Original licence: Copyright (c) Microsoft, under the MIT License.
# ------------------------------------------------------------------------------
"""Download Kinetics clips from YouTube and trim them to the annotated span.

Each row of the annotation CSV names a YouTube video and a start/end time.
The full video is fetched with ``youtube-dl`` into a temporary directory and
then cut with ``ffmpeg`` into ``<output_dir>/<label>/<id>_<start>_<end>.mp4``.
Both executables must be available on ``PATH``.
"""
import argparse
import glob
import json
import os
import shutil
import ssl
import subprocess
import uuid
from collections import OrderedDict

import pandas as pd

try:
    from joblib import Parallel, delayed
except ImportError:
    # joblib is only needed for the parallel path (num_jobs != 1); the serial
    # path keeps working without it and main() reports the missing package.
    Parallel = delayed = None

# NOTE: this disables HTTPS certificate verification process-wide for code
# that relies on the default SSL context. Kept from the original script.
ssl._create_default_https_context = ssl._create_unverified_context


def create_video_folders(dataset, output_dir, tmp_dir):
    """Create the output directories needed for ``dataset``.

    ``output_dir`` and ``tmp_dir`` are always created. If the dataset has a
    'label-name' column, one sub-directory per unique label is created under
    ``output_dir``; otherwise (test split without labels) a single 'test'
    sub-directory is created.

    Args:
        dataset: DataFrame of annotations, optionally with a 'label-name'
            column.
        output_dir (str): Root directory for the trimmed clips.
        tmp_dir (str): Scratch directory for the untrimmed downloads.

    Returns:
        dict | str: Mapping from label name to its directory, or the path of
        the 'test' directory when the dataset has no labels.
    """
    # exist_ok avoids the check-then-create race when several processes
    # start at once.
    os.makedirs(output_dir, exist_ok=True)
    # Created before the early return below so that unlabeled (test) runs
    # also get their scratch directory.
    os.makedirs(tmp_dir, exist_ok=True)

    if 'label-name' not in dataset.columns:
        test_dir = os.path.join(output_dir, 'test')
        os.makedirs(test_dir, exist_ok=True)
        return test_dir

    label_to_dir = {}
    for label_name in dataset['label-name'].unique():
        label_dir = os.path.join(output_dir, label_name)
        os.makedirs(label_dir, exist_ok=True)
        label_to_dir[label_name] = label_dir
    return label_to_dir


def construct_video_filename(row, label_to_dir, trim_format='%06d'):
    """Build the output path of the trimmed clip described by ``row``.

    Args:
        row: Mapping with 'video-id', 'start-time', 'end-time' and, when
            ``label_to_dir`` is a dict, 'label-name'.
        label_to_dir (dict | str): Either a label -> directory mapping or a
            single directory used for every clip.
        trim_format (str): %-style format applied to the start and end times.

    Returns:
        str: ``<dir>/<video-id>_<start>_<end>.mp4``.
    """
    basename = '%s_%s_%s.mp4' % (row['video-id'],
                                 trim_format % row['start-time'],
                                 trim_format % row['end-time'])
    if isinstance(label_to_dir, dict):
        dirname = label_to_dir[row['label-name']]
    else:
        dirname = label_to_dir
    return os.path.join(dirname, basename)


def _run_command(command):
    """Run ``command`` (an argument list) and report ``(ok, log)``.

    ``log`` is always text so that it can be written to the JSON report; it
    holds the combined stdout/stderr of a failed command, or the OS error
    message when the executable could not be started.
    """
    try:
        subprocess.check_output(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        output = err.output
        if isinstance(output, bytes):
            output = output.decode('utf-8', errors='replace')
        return False, output or ''
    except OSError as err:
        # Raised e.g. when youtube-dl / ffmpeg is not installed.
        return False, str(err)
    return True, ''


def _remove_files(paths):
    """Best-effort removal of temporary files; missing files are ignored."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            pass


def download_clip(video_identifier,
                  output_filename,
                  start_time,
                  end_time,
                  tmp_dir='/tmp/kinetics/.tmp_dir',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from YouTube and trim it to ``[start, end]``.

    Args:
        video_identifier (str): Unique YouTube video identifier
            (11 characters).
        output_filename (str): File path where the trimmed video is stored.
        start_time (float): Beginning time in seconds from where the video
            will be trimmed.
        end_time (float): Ending time in seconds of the trimmed video.
        tmp_dir (str): Directory holding the untrimmed download.
        num_attempts (int): Maximum number of download attempts (at least
            one attempt is always made).
        url_base (str): URL prefix the identifier is appended to.

    Returns:
        tuple[bool, str]: ``(status, log)`` where ``status`` tells whether
        ``output_filename`` exists afterwards and ``log`` is 'Downloaded',
        'Exists' or the text of the failure.
    """
    # Defensive argument checking.
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    # Nothing to do (and nothing temporary to clean up) if the clip is there.
    if os.path.exists(output_filename):
        return True, 'Exists'

    # youtube-dl substitutes %(ext)s itself; the unique prefix lets us find
    # the resulting file whatever extension it got.
    tmp_prefix = os.path.join(tmp_dir, str(uuid.uuid4()))
    tmp_pattern = glob.escape(tmp_prefix) + '.*'
    download_cmd = [
        'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate',
        '-f', 'mp4', '-o', tmp_prefix + '.%(ext)s',
        url_base + video_identifier
    ]
    print(' '.join(download_cmd))

    log = ''
    for _ in range(max(1, num_attempts)):
        downloaded, log = _run_command(download_cmd)
        if downloaded:
            break
    else:
        # Every attempt failed; drop partial downloads before giving up.
        _remove_files(glob.glob(tmp_pattern))
        return False, log

    # Match on the full unique prefix. Splitting the path at its first '.'
    # is wrong as soon as a directory (e.g. '.tmp_dir') contains a dot.
    tmp_files = glob.glob(tmp_pattern)
    if not tmp_files:
        return False, 'Downloaded file not found for %s' % video_identifier

    # Construct command to trim the video (ffmpeg required).
    trim_cmd = [
        'ffmpeg', '-i', tmp_files[0], '-ss',
        str(start_time), '-t',
        str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy',
        '-threads', '1', '-loglevel', 'panic', output_filename
    ]
    try:
        trimmed, log = _run_command(trim_cmd)
    finally:
        # The untrimmed download is never needed again, success or not.
        _remove_files(tmp_files)
    if not trimmed:
        # A partial output would be mistaken for a finished clip next run.
        _remove_files([output_filename])
        return False, log

    # Check if the video was successfully saved.
    return os.path.exists(output_filename), 'Downloaded'


def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
    """Download the clip of one dataset row; wrapper for parallel processing.

    Returns:
        tuple: ``(clip_id, status, log)`` where ``clip_id`` is the output
        file name without the '.mp4' suffix.
    """
    output_filename = construct_video_filename(row, label_to_dir, trim_format)
    clip_id = os.path.basename(output_filename).split('.mp4')[0]
    if os.path.exists(output_filename):
        return clip_id, True, 'Exists'

    downloaded, log = download_clip(
        row['video-id'],
        output_filename,
        row['start-time'],
        row['end-time'],
        tmp_dir=tmp_dir)
    return clip_id, downloaded, log


def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
    """Read a Kinetics annotation CSV into a DataFrame.

    Args:
        input_csv (str): Path to a CSV file. If it has a 'youtube_id'
            column, the columns 'youtube_id', 'time_start', 'time_end' and
            'label' are renamed to 'video-id', 'start-time', 'end-time' and
            'label-name'.
        ignore_is_cc (bool): If True (and the file has a 'youtube_id'
            column), drop the last column of the file.

    Returns:
        DataFrame: The parsed annotations.
    """
    df = pd.read_csv(input_csv)
    if 'youtube_id' in df.columns:
        columns = OrderedDict([('youtube_id', 'video-id'),
                               ('time_start', 'start-time'),
                               ('time_end', 'end-time'),
                               ('label', 'label-name')])
        df = df.rename(columns=columns)
        if ignore_is_cc:
            df = df.iloc[:, :-1]
    return df


def main(input_csv,
         output_dir,
         trim_format='%06d',
         num_jobs=24,
         tmp_dir='/tmp/kinetics'):
    """Download and trim every clip listed in ``input_csv``.

    A list of ``[clip_id, status, log]`` entries is written to
    'download_report.json' in the current working directory.

    Args:
        input_csv (str): Annotation CSV (see parse_kinetics_annotations).
        output_dir (str): Directory where the trimmed clips are saved.
        trim_format (str): %-style format for the times in the file names.
        num_jobs (int): Number of parallel jobs; 1 runs serially without
            joblib.
        tmp_dir (str): Parent of the scratch directory ('.tmp_dir' is
            appended); the scratch directory is deleted at the end.

    Raises:
        ImportError: If ``num_jobs != 1`` and joblib is not installed.
    """
    tmp_dir = os.path.join(tmp_dir, '.tmp_dir')

    # Reading and parsing Kinetics.
    dataset = parse_kinetics_annotations(input_csv)

    # Creates folders where videos will be saved later.
    label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)

    # Download all clips.
    if num_jobs == 1:
        status_list = [
            download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)
            for _, row in dataset.iterrows()
        ]
    else:
        if Parallel is None:
            raise ImportError('joblib is required when num_jobs != 1')
        status_list = Parallel(n_jobs=num_jobs)(
            delayed(download_clip_wrapper)(row, label_to_dir, trim_format,
                                           tmp_dir)
            for _, row in dataset.iterrows())

    # Clean tmp dir. Best effort: a cleanup problem must not lose the report.
    shutil.rmtree(tmp_dir, ignore_errors=True)

    # Save download report.
    with open('download_report.json', 'w') as fobj:
        json.dump(status_list, fobj)


def build_arg_parser():
    """Return the command-line argument parser of this script."""
    description = 'Helper script for downloading and trimming kinetics videos.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'input_csv',
        type=str,
        help=('CSV file containing the following format: '
              'YouTube Identifier,Start time,End time,Class label'))
    parser.add_argument(
        'output_dir',
        type=str,
        help='Output directory where videos will be saved.')
    # argparse %-formats help strings, so literal '%' must be written '%%'.
    parser.add_argument(
        '-f',
        '--trim-format',
        type=str,
        default='%06d',
        help=('This will be the format for the '
              'filename of trimmed videos: '
              'videoid_%%0xd(start_time)_%%0xd(end_time).mp4'))
    parser.add_argument('-n', '--num-jobs', type=int, default=24)
    parser.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics')
    return parser


if __name__ == '__main__':
    main(**vars(build_arg_parser().parse_args()))