--- a
+++ b/rocaseg/datasets/prepare_dataset_okoa.py
@@ -0,0 +1,204 @@
+import os
+from collections import defaultdict
+import click
+
+from joblib import Parallel, delayed
+from glob import glob
+from tqdm import tqdm
+
+import numpy as np
+import pandas as pd
+
+import pydicom
+import cv2
+
+
+cv2.ocl.setUseOpenCL(False)
+
+
+def read_dicom(fname, only_data=False):
+    data = pydicom.read_file(fname)
+    if len(data.PixelData) == 131072:
+        dtype = np.uint16
+    else:
+        dtype = np.uint8
+    image = np.frombuffer(data.PixelData, dtype=dtype).astype(float)
+
+    if data.PhotometricInterpretation == 'MONOCHROME1':
+        image = image.max() - image
+
+    image = image.reshape((data.Rows, data.Columns))
+
+    if only_data:
+        return image
+    else:
+        if hasattr(data, 'ImagerPixelSpacing'):
+            spacing = [float(e) for e in data.ImagerPixelSpacing]
+            slice_thickness = float(data.SliceThickness)
+        elif hasattr(data, 'PixelSpacing'):
+            spacing = [float(e) for e in data.PixelSpacing]
+            slice_thickness = float(data.SliceThickness)
+        else:
+            msg = f'DICOM {fname} does not have the required attributes'
+            print(msg)
+            spacing = (0.0, 0.0)
+            slice_thickness = 0.0
+        return image, spacing[0], spacing[1], slice_thickness
+
+
+@click.command()
+@click.argument('path_root_okoa')
+@click.argument('path_root_output')
+@click.option('--num_threads', default=12, type=click.IntRange(0, 16))
+@click.option('--margin', default=0, type=int)
+@click.option('--meta_only', is_flag=True)
+def main(**config):
+    config['path_root_okoa'] = os.path.abspath(config['path_root_okoa'])
+    config['path_root_output'] = os.path.abspath(config['path_root_output'])
+
+    # -------------------------------------------------------------------------
+    def worker_s5g(path_root_output, row, margin):
+        meta = defaultdict(list)
+
+        patient = row['patient'].values[0]
+        slice_idx = row['slice_idx'].values[0]
+        side = row['side'].values[0]
+        release = 'initial'
+        sequence = 't2_de3d_we_sag_iso'
+
+        image, *dicom_meta = read_dicom(row[('fname_full', 'Images')])
+
+        # Set the default values for the voxel spacings
+        pixel_spacing_0 = dicom_meta[0] or '0.5859375'
+        pixel_spacing_1 = dicom_meta[1] or '0.5859375'
+        slice_thickness = dicom_meta[2] or '0.60000002384186'
+
+        mask_femur = read_dicom(row[('fname_full', 'Femur')], only_data=True)
+        mask_tibia = read_dicom(row[('fname_full', 'Tibia')], only_data=True)
+        mask_full = np.zeros(mask_femur.shape, dtype=np.uint8)
+        # Use inverse order to prioritize femoral tissues in collision handling
+        mask_full[mask_tibia > 0] = 2
+        mask_full[mask_femur > 0] = 1
+
+        if margin:
+            image = image[margin:-margin, margin:-margin]
+            mask_full = mask_full[margin:-margin, margin:-margin]
+
+        fname_pattern = '{slice_idx:>03}.{ext}'
+
+        # Save image and mask data
+        dir_rel_image = os.path.join(patient, release, sequence, 'images')
+        dir_rel_mask = os.path.join(patient, release, sequence, 'masks')
+        dir_abs_image = os.path.join(path_root_output, dir_rel_image)
+        dir_abs_mask = os.path.join(path_root_output, dir_rel_mask)
+        for d in (dir_abs_image, dir_abs_mask):
+            if not os.path.exists(d):
+                os.makedirs(d)
+
+        fname_image = fname_pattern.format(slice_idx=slice_idx, ext='png')
+        path_abs_image = os.path.join(dir_abs_image, fname_image)
+        if not config['meta_only']:
+            cv2.imwrite(path_abs_image, image)
+
+        fname_mask = fname_pattern.format(slice_idx=slice_idx, ext='png')
+        path_abs_mask = os.path.join(dir_abs_mask, fname_mask)
+        if not config['meta_only']:
+            cv2.imwrite(path_abs_mask, mask_full)
+
+        path_rel_image = os.path.join(dir_rel_image, fname_image)
+        path_rel_mask = os.path.join(dir_rel_mask, fname_mask)
+
+        meta['subset'].append(row['subset'].values[0])
+        meta['patient'].append(patient)
+        meta['release'].append(release)
+        meta['sequence'].append(sequence)
+        meta['side'].append(side)
+        meta['KL'].append(row['KL'].values[0])
+        meta['slice_idx'].append(slice_idx)
+        meta['pixel_spacing_0'].append(pixel_spacing_0)
+        meta['pixel_spacing_1'].append(pixel_spacing_1)
+        meta['slice_thickness'].append(slice_thickness)
+        meta['path_rel_image'].append(path_rel_image)
+        meta['path_rel_mask'].append(path_rel_mask)
+        return meta
+
+    # -------------------------------------------------------------------------
+
+    # Get list of images files
+    paths_fnames_dicom = glob(os.path.join(config['path_root_okoa'], '**', '*.IMA'),
+                              recursive=True)
+
+    # root / training|evaluation / P36 / Images|Femur|Tibia / (1-160).IMA
+    def meta_from_fname(fn):
+        tmp = fn.split('/')
+        slice_idx = int(os.path.splitext(tmp[-1])[0]) - 1
+        slice_idx = '{:>03}'.format(slice_idx)
+        meta = {
+            'fname_full': fn,
+            'slice_idx': slice_idx,
+            'kind': tmp[-2],
+            'patient': tmp[-3],
+            'subset': tmp[-4]
+        }
+        return meta
+
+    dict_meta = {
+        'fname_full': [],
+        'slice_idx': [],
+        'kind': [],
+        'patient': [],
+        'subset': []
+    }
+
+    for e in paths_fnames_dicom:
+        tmp_meta = meta_from_fname(e)
+        for k, v in tmp_meta.items():
+            dict_meta[k].append(v)
+
+    df_meta = pd.DataFrame.from_dict(dict_meta)
+    df_meta = (df_meta
+               .set_index(['subset', 'patient', 'slice_idx', 'kind'])
+               .unstack('kind')
+               .reset_index())
+
+    # Add info on scan side and KL grades
+    path_file_side = os.path.join(config['path_root_okoa'], 'sides.csv')
+    df_side = pd.read_csv(path_file_side)
+    path_file_kl = os.path.join(config['path_root_okoa'], 'KL_grades.csv')
+    df_kl = pd.read_csv(path_file_kl)
+
+    df_extra = pd.merge(df_side, df_kl, on='patient', how='left', sort=True)
+    df_extra.loc[:, 'KL'] = -1
+    for r_idx, r in df_extra.iterrows():
+        if r['side'] == 'LEFT':
+            df_extra.loc[r_idx, 'KL'] = r['KL left']
+        elif r['side'] == 'RIGHT':
+            df_extra.loc[r_idx, 'KL'] = r['KL right']
+
+    # Keep only the fields of interest
+    df_extra = df_extra.loc[:, ['patient', 'side', 'KL', 'age']]
+
+    # Make same multi-index such than pd.merge doesn't create extra column
+    df_extra.columns = pd.MultiIndex.from_tuples([(c, '') for c in df_extra.columns])
+
+    # Merge the metadata into the single df
+    df_meta = pd.merge(df_meta, df_extra, on='patient', how='left', sort=True)
+
+    # Process the raw data
+    metas = Parallel(config['num_threads'])(delayed(worker_s5g)(
+        *[config['path_root_output'], row, config['margin']]
+    ) for _, row in tqdm(df_meta.iterrows(), total=len(df_meta)))
+
+    # Merge meta information from different stacks
+    tmp = defaultdict(list)
+    for d in metas:
+        for k, v in d.items():
+            tmp[k].extend(v)
+    df_out = pd.DataFrame.from_dict(tmp)
+
+    path_output_meta = os.path.join(config['path_root_output'], 'meta_base.csv')
+    df_out.to_csv(path_output_meta, index=False)
+
+
+if __name__ == '__main__':
+    main()