Switch to side-by-side view

--- a
+++ b/tools/data/kinetics/download.py
@@ -0,0 +1,230 @@
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/activitynet/ActivityNet/
+# Original licence: Copyright (c) Microsoft, under the MIT License.
+# ------------------------------------------------------------------------------
+import argparse
+import glob
+import json
+import os
+import shutil
+import ssl
+import subprocess
+import uuid
+from collections import OrderedDict
+
+import pandas as pd
+from joblib import Parallel, delayed
+
+ssl._create_default_https_context = ssl._create_unverified_context
+
+
+def create_video_folders(dataset, output_dir, tmp_dir):
+    """Creates a directory for each label name in the dataset."""
+    if 'label-name' not in dataset.columns:
+        this_dir = os.path.join(output_dir, 'test')
+        if not os.path.exists(this_dir):
+            os.makedirs(this_dir)
+        # I should return a dict but ...
+        return this_dir
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    if not os.path.exists(tmp_dir):
+        os.makedirs(tmp_dir)
+
+    label_to_dir = {}
+    for label_name in dataset['label-name'].unique():
+        this_dir = os.path.join(output_dir, label_name)
+        if not os.path.exists(this_dir):
+            os.makedirs(this_dir)
+        label_to_dir[label_name] = this_dir
+    return label_to_dir
+
+
+def construct_video_filename(row, label_to_dir, trim_format='%06d'):
+    """Given a dataset row, this function constructs the output filename for a
+    given video."""
+    basename = '%s_%s_%s.mp4' % (row['video-id'],
+                                 trim_format % row['start-time'],
+                                 trim_format % row['end-time'])
+    if not isinstance(label_to_dir, dict):
+        dirname = label_to_dir
+    else:
+        dirname = label_to_dir[row['label-name']]
+    output_filename = os.path.join(dirname, basename)
+    return output_filename
+
+
+def download_clip(video_identifier,
+                  output_filename,
+                  start_time,
+                  end_time,
+                  tmp_dir='/tmp/kinetics/.tmp_dir',
+                  num_attempts=5,
+                  url_base='https://www.youtube.com/watch?v='):
+    """Download a video from youtube if exists and is not blocked.
+    arguments:
+    ---------
+    video_identifier: str
+        Unique YouTube video identifier (11 characters)
+    output_filename: str
+        File path where the video will be stored.
+    start_time: float
+        Indicates the beginning time in seconds from where the video
+        will be trimmed.
+    end_time: float
+        Indicates the ending time in seconds of the trimmed video.
+    """
+    # Defensive argument checking.
+    assert isinstance(video_identifier, str), 'video_identifier must be string'
+    assert isinstance(output_filename, str), 'output_filename must be string'
+    assert len(video_identifier) == 11, 'video_identifier must have length 11'
+
+    status = False
+    # Construct command line for getting the direct video link.
+    tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4())
+
+    if not os.path.exists(output_filename):
+        if not os.path.exists(tmp_filename):
+            command = [
+                'youtube-dl', '--quiet', '--no-warnings',
+                '--no-check-certificate', '-f', 'mp4', '-o',
+                '"%s"' % tmp_filename,
+                '"%s"' % (url_base + video_identifier)
+            ]
+            command = ' '.join(command)
+            print(command)
+            attempts = 0
+            while True:
+                try:
+                    subprocess.check_output(
+                        command, shell=True, stderr=subprocess.STDOUT)
+                except subprocess.CalledProcessError as err:
+                    attempts += 1
+                    if attempts == num_attempts:
+                        return status, err.output
+                else:
+                    break
+
+        tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0]
+        # Construct command to trim the videos (ffmpeg required).
+        command = [
+            'ffmpeg', '-i',
+            '"%s"' % tmp_filename, '-ss',
+            str(start_time), '-t',
+            str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy',
+            '-threads', '1', '-loglevel', 'panic',
+            '"%s"' % output_filename
+        ]
+        command = ' '.join(command)
+        try:
+            subprocess.check_output(
+                command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as err:
+            return status, err.output
+
+    # Check if the video was successfully saved.
+    status = os.path.exists(output_filename)
+    os.remove(tmp_filename)
+    return status, 'Downloaded'
+
+
+def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
+    """Wrapper for parallel processing purposes."""
+    output_filename = construct_video_filename(row, label_to_dir, trim_format)
+    clip_id = os.path.basename(output_filename).split('.mp4')[0]
+    if os.path.exists(output_filename):
+        status = tuple([clip_id, True, 'Exists'])
+        return status
+
+    downloaded, log = download_clip(
+        row['video-id'],
+        output_filename,
+        row['start-time'],
+        row['end-time'],
+        tmp_dir=tmp_dir)
+    status = tuple([clip_id, downloaded, log])
+    return status
+
+
+def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
+    """Returns a parsed DataFrame.
+    arguments:
+    ---------
+    input_csv: str
+        Path to CSV file containing the following columns:
+          'YouTube Identifier,Start time,End time,Class label'
+    returns:
+    -------
+    dataset: DataFrame
+        Pandas with the following columns:
+            'video-id', 'start-time', 'end-time', 'label-name'
+    """
+    df = pd.read_csv(input_csv)
+    if 'youtube_id' in df.columns:
+        columns = OrderedDict([('youtube_id', 'video-id'),
+                               ('time_start', 'start-time'),
+                               ('time_end', 'end-time'),
+                               ('label', 'label-name')])
+        df.rename(columns=columns, inplace=True)
+        if ignore_is_cc:
+            df = df.loc[:, df.columns.tolist()[:-1]]
+    return df
+
+
+def main(input_csv,
+         output_dir,
+         trim_format='%06d',
+         num_jobs=24,
+         tmp_dir='/tmp/kinetics'):
+    tmp_dir = os.path.join(tmp_dir, '.tmp_dir')
+
+    # Reading and parsing Kinetics.
+    dataset = parse_kinetics_annotations(input_csv)
+
+    # Creates folders where videos will be saved later.
+    label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)
+
+    # Download all clips.
+    if num_jobs == 1:
+        status_list = []
+        for _, row in dataset.iterrows():
+            status_list.append(
+                download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir))
+    else:
+        status_list = Parallel(
+            n_jobs=num_jobs)(delayed(download_clip_wrapper)(
+                row, label_to_dir, trim_format, tmp_dir)
+                             for i, row in dataset.iterrows())
+
+    # Clean tmp dir.
+    shutil.rmtree(tmp_dir)
+
+    # Save download report.
+    with open('download_report.json', 'w') as fobj:
+        fobj.write(json.dumps(status_list))
+
+
+if __name__ == '__main__':
+    description = 'Helper script for downloading and trimming kinetics videos.'
+    p = argparse.ArgumentParser(description=description)
+    p.add_argument(
+        'input_csv',
+        type=str,
+        help=('CSV file containing the following format: '
+              'YouTube Identifier,Start time,End time,Class label'))
+    p.add_argument(
+        'output_dir',
+        type=str,
+        help='Output directory where videos will be saved.')
+    p.add_argument(
+        '-f',
+        '--trim-format',
+        type=str,
+        default='%06d',
+        help=('This will be the format for the '
+              'filename of trimmed videos: '
+              'videoid_%0xd(start_time)_%0xd(end_time).mp4'))
+    p.add_argument('-n', '--num-jobs', type=int, default=24)
+    p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics')
+    # help='CSV file of the previous version of Kinetics.')
+    main(**vars(p.parse_args()))