slideslicer / Git / [5a7589] /slideslicer/sample_from

Models:
DanielG/
slideslicer
Downloads: 1
[5a7589]: / slideslicer / sample_from_slide.py
History
Download this file
459 lines (389 with data), 16.1 kB

# coding: utf-8

from PIL import Image
import numpy as np
from collections import Counter
import pandas as pd
import os
import re
import json
from warnings import warn

import openslide
import cv2
from pycocotools.mask import encode, decode

from slideslicer.extract_rois_svs_xml import extract_rois_svs_xml
from slideslicer.slideutils import (plot_contour, get_median_color, 
                        get_thumbnail_magnification,
                        get_img_bbox, get_rotated_highres_roi,
                        get_uniform_tiles, 
                        get_threshold_tissue_mask, 
                        convert_contour2mask,
                        convert_mask2contour,
                        CropRotateRoi,
                        get_contour_centre, read_roi_patches_from_slide,
                        clip_roi_wi_bbox, sample_points)


def get_img_id(svsname):
    """Derive an image identifier from the path of an ``.svs`` slide file.

    The directory part and a trailing ``.svs`` extension are dropped, then
    every space and hyphen in the remaining name is replaced by an underscore.

    Parameters
    ----------
    svsname : str
        Path (or bare file name) of the slide.

    Returns
    -------
    str
        The sanitised identifier.
    """
    basename = os.path.basename(svsname)
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    imgid = re.sub(r"\.svs$", "", basename)
    return imgid.replace(" ", "_").replace("-", "_")

def get_prefix(imgid, pos, name, tissueid, id, parentdir = "data", suffix=''):
    """Compose the extension-less output path for one patch.

    The result has the form
    ``{parentdir}/{typ}/{imgid}-x{X}-y{Y}-t{tissueid}-r{id}-{typ}{suffix}``
    where ``typ`` is ``name`` with spaces turned into underscores and
    ``X``, ``Y`` are the first two items of ``pos``.

    Parameters
    ----------
    imgid : image identifier placed at the start of the file stem.
    pos : sequence whose first two items are formatted as x and y.
    name : ROI type name; also used as the sub-directory name.
    tissueid : value written after ``-t``.
    id : value written after ``-r``.
    parentdir : leading directory of the returned path.
    suffix : text appended at the very end of the prefix.

    Returns
    -------
    str
        The path prefix; callers append the file extension.
    """
    typ = name.replace(" ", "_")
    position = "x{}-y{}".format(*pos)
    stem = "{}-{}-t{}-r{}-{}{}".format(imgid, position, tissueid, id, typ, suffix)
    return "{}/{}/{}".format(parentdir, typ, stem)


def summarize_rois_wi_patch(rois, bg_names = ("tissue",), frac_thr=16):
    """Summarise the ROIs found within one patch into a single label.

    ROIs whose ``name`` is listed in ``bg_names`` are treated as background
    (tissue); the remaining ones are grouped by name and their areas summed.

    Parameters
    ----------
    rois : iterable of dict
        Each dict must provide the keys ``'name'``, ``'area'`` and ``'id'``.
    bg_names : collection of str
        Names regarded as background.
    frac_thr : number
        If the largest summed area exceeds the second largest by more than
        this factor, the patch is labelled with the largest class alone.

    Returns
    -------
    dict
        ``name``      -- ``'blank'`` when there is no non-background ROI, the
                         dominant class name, or all class names joined by
                         ``'+'`` (largest summed area first);
        ``id``        -- the background id string for blank patches, the id of
                         the largest ROI of the dominant class, or the sorted
                         per-class ids joined by ``'+'``;
        ``tissue_id`` -- sorted background ROI ids joined by ``'+'``;
        ``stats``     -- per-ROI records (area, name, id), largest area first.
    """
    background = [rr for rr in rois if rr['name'] in bg_names]
    foreground = [rr for rr in rois if rr['name'] not in bg_names]
    tissue_id = "+".join(sorted(str(tt['id']) for tt in background))

    if not foreground:
        # Nothing but background: no need to go through pandas at all.
        return {'name': 'blank',
                "id": tissue_id,
                "tissue_id": tissue_id,
                "stats": []}

    dfareas = (pd.DataFrame({"area": [rr['area'] for rr in foreground],
                             "name": [rr['name'] for rr in foreground],
                             "id": [rr['id'] for rr in foreground]})
                 .sort_values("area", ascending=False)
               )
    # "first" picks the id of the largest ROI of each class because dfareas
    # is already sorted by area.  The string "sum" is used instead of the
    # builtin, which pandas deprecates as an aggregation argument.
    areasum = (dfareas.groupby('name')
                      .agg({"area": "sum", "id": "first"})
                      .sort_values("area", ascending=False)
               )
    stats = dfareas.to_dict(orient='records')

    if len(areasum) == 0:
        # Can still happen when every name is missing (groupby drops such keys).
        return {'name': 'blank',
                "id": tissue_id,
                "tissue_id": tissue_id,
                "stats": stats}

    # areasum is indexed by class name, so rows must be addressed by position
    # with .iloc; integer keys through [] are a deprecated positional fallback.
    class_areas = areasum["area"]
    dominant = (len(areasum) == 1
                or class_areas.iloc[0] / class_areas.iloc[1] > frac_thr)
    if dominant:
        name = areasum.index[0]
        id = areasum["id"].iloc[0]
    else:
        name = '+'.join(areasum.index.tolist())
        id = '+'.join(sorted(areasum["id"].astype(str).tolist()))
    return {"name": name,
            "id": str(id),
            "tissue_id": tissue_id,
            "stats": stats}


# Rewrite for generator if needed:
def visualise_chunks_and_rois(img_arr, roi_cropped_list,
                              nrows = 5, figsize=(15,15)
                             ):
    """Plot image patches on a grid with their ROI contours overlaid.

    Parameters
    ----------
    img_arr : sized sequence of images
        One image per patch; each is shown with ``imshow``.
    roi_cropped_list : sequence of lists of dict
        For every patch, its ROIs. Each ROI needs the keys ``'name'``,
        ``'id'`` and ``'vertices'``. ROIs named ``'tissue'`` are not drawn
        and not listed in the label.
    nrows : int
        Number of rows of the subplot grid.
    figsize : tuple
        Figure size handed to ``plt.subplots``.

    The x-label of every panel lists ``id: name`` of the drawn ROIs.
    """
    # Imported here because the module does not import pyplot at top level
    # (it is only imported inside the script section), so relying on a global
    # ``plt`` raises NameError when this function is used from an import.
    from matplotlib import pyplot as plt

    # Round up so that trailing patches are not silently dropped, and never
    # ask for zero columns.
    ncols = max(1, -(-len(img_arr) // nrows))
    # squeeze=False keeps ``axs`` a 2-D array even for a 1x1 grid, where
    # ``subplots`` would otherwise return a bare Axes without ``ravel``.
    fig, axs = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    for ax, reg, rois in zip(axs.ravel(), img_arr, roi_cropped_list):
        ax.imshow(reg)
        annotated = [rr for rr in rois if rr['name'] != 'tissue']
        for rr in annotated:
            plot_contour(rr["vertices"], ax=ax)
        xlab = "\n".join("{}: {}".format(rr['id'], rr['name'])
                         for rr in annotated)
        ax.set_xlabel(xlab)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        

def get_tissue_rois(slide,
                    roilist,
                    vis = False,
                    step = 1024,
                    magnlevel = 0,
                    target_size = None,
                    maxarea = 1e7,
                    random=False,
                    normal_only=True,
                    shift_factor = 2,
                   ):
    """Yield, per tissue ROI, an iterator over patches sampled inside it.

    For every entry of ``roilist`` named ``'tissue'``, points are sampled
    within its contour via ``sample_points`` and handed, as point-like ROIs
    of zero area, to ``read_roi_patches_from_slide``.

    Parameters
    ----------
    slide : slide object passed through to ``read_roi_patches_from_slide``.
    roilist : list of dict
        ROIs with the keys ``'name'``, ``'id'`` and ``'vertices'``; the whole
        list is also passed on as ``but_list``.
    vis : unused.
    step : int
        Spacing between sampled points; the shift passed to ``sample_points``
        is ``-step // shift_factor``.
    magnlevel, maxarea : forwarded to ``read_roi_patches_from_slide``.
    target_size : list or None
        Patch size; defaults to ``[step, step]``.
    random : bool
        Sample points in ``'random'`` mode instead of ``'grid'``.
    normal_only : bool
        Keep only patches whose ROI list (item 1 of each produced tuple)
        consists exclusively of ROIs named ``'tissue'``.
    shift_factor : int
        Divisor used to derive the sampling shift from ``step``.

    Yields
    ------
    iterable
        One (possibly filtered) patch iterator per tissue ROI.
    """
    print("NORMAL_ONLY", normal_only)
    if target_size is None:
        target_size = [step, step]

    sampling_mode = 'random' if random else 'grid'

    def _tissue_only(item):
        # item[1] holds the ROIs overlapping the patch
        for overlapping in item[1]:
            if overlapping['name'] != 'tissue':
                return False
        return True

    tissue_rois = [entry for entry in roilist if entry['name'] == 'tissue']

    for roi in tissue_rois:
        print("tissue roi, id", roi["id"])
        points = sample_points(roi["vertices"],
                               spacing = step,
                               shift = -step//shift_factor,
                               mode = sampling_mode)

        print("roi {} #{}:\t{:d} points sampled".format(roi["name"], roi["id"],len(points), ))

        # every sampled point becomes a degenerate one-vertex ROI
        pointroilist = []
        for pp in points:
            pointroilist.append({"vertices": [pp], "area": 0})

        patches = read_roi_patches_from_slide(slide,
                                              pointroilist,
                                              but_list = roilist,
                                              target_size = target_size,
                                              magnlevel = magnlevel,
                                              maxarea = maxarea,
                                              color=1,
                                              nchannels=3,
                                              allcomponents = True,
                                              nomask=True,
                                             )
        if normal_only:
            # drop patches that overlap anything other than plain tissue
            patches = filter(_tissue_only, patches)
        yield patches


def save_tissue_chunks(imgroiiter, imgid, parentdir="data",
                       lower = [0, 0, 180],
                       upper = [179, 10, 255],
                       close=50,
                       open_=30,
                       filtersize = 20,
                       frac_thr=16,
                       ):
    """Write every patch of an iterator to disk as a PNG plus a ROI JSON.

    Parameters
    ----------
    imgroiiter : iterable of 4-tuples ``(image, rois, <ignored>, start_xy)``.
    imgid : image identifier used in the file names.
    parentdir : root directory of the output tree.
    lower, upper, close, open_, filtersize :
        forwarded to ``add_roi_bytes`` (``open_`` as its ``open`` argument).
    frac_thr : forwarded to ``summarize_rois_wi_patch``.

    For each patch the file prefix comes from ``get_prefix``, using the
    summary name, the summary id in the tissue-id slot and the running patch
    index in the ROI-id slot. Missing directories are created.
    """
    for index, item in enumerate(imgroiiter):
        reg, rois, _, start_xy = item

        # bg_names is empty here, so 'tissue' counts as an ordinary class
        summary = summarize_rois_wi_patch(rois, bg_names = [], frac_thr=frac_thr)
        prefix = get_prefix(imgid, start_xy, summary["name"], summary["id"], index,
                            parentdir=parentdir,)

        json_path = prefix + ".json"
        png_path = prefix + '.png'
        print(png_path)

        os.makedirs(os.path.dirname(json_path), exist_ok=True)

        # accept both PIL images and array-likes
        picture = reg if isinstance(reg, Image.Image) else Image.fromarray(reg)
        picture.save(png_path)

        annotated = add_roi_bytes(rois, np.asarray(reg),
                                  lower=lower, upper=upper,
                                  open=open_, close=close,
                                  filtersize=filtersize)
        with open(json_path, 'w+') as fhj:
            json.dump(annotated, fhj)


def add_roi_bytes(rois, reg,
                  lower = [0, 0, 180],
                  upper = [179, 25, 255],
                  filtersize=25,
                  close=True,
                  open=False,
                  minlen = -1):
    """Attach run-length-encoded masks to the ROIs of one patch.

    Every ROI not named ``'tissue'`` gets a mask rasterised from its vertices
    with ``convert_contour2mask``; the mask is encoded with pycocotools
    ``encode`` and the resulting dict is merged into the ROI (its ``counts``
    bytes decoded to ``str``).  The tissue ROI is handled last: its mask is
    obtained by thresholding ``reg`` with ``get_threshold_tissue_mask``, its
    vertices are replaced by the longest contour of that mask, and the
    pixels covered by any other ROI are removed from it before encoding.

    Parameters
    ----------
    rois : list of dict
        ROIs of the patch (keys ``'name'`` and ``'vertices'`` are required).
        The list itself is copied, but the dicts are updated in place.
    reg : array
        Patch image; ``reg.shape[0]`` / ``reg.shape[1]`` are used as mask
        height / width.
    lower, upper, filtersize, close, open :
        forwarded to ``get_threshold_tissue_mask``.
    minlen : int
        Passed to ``convert_mask2contour``; ``-1`` means "use ``filtersize``".

    Returns
    -------
    list of dict
        The ROIs that still have a non-empty ``'vertices'`` entry.
    """
    if minlen==-1:
        minlen=filtersize
    rois = rois.copy()
    tissue_roi = None
    # stays the int 0 until at least one non-tissue mask has been merged in
    other_mask_ = 0

    print('ROIS:', *[roi_['name'] for roi_ in rois])
    for roi_ in rois:
        if roi_["name"] == "tissue":
            tissue_roi = roi_
            continue
        mask_ = convert_contour2mask(roi_["vertices"],
                                     reg.shape[1], reg.shape[0],
                                     fill=1, order='F')

        cocomask = encode(np.asarray(mask_, dtype='uint8'))
        cocomask["counts"] = cocomask["counts"].decode('utf-8')
        roi_.update(cocomask)
        if isinstance(roi_["vertices"], np.ndarray):
            roi_["vertices"] = roi_["vertices"].tolist()
        other_mask_ = np.maximum(other_mask_, mask_)

    roi_ = tissue_roi
    if roi_ is None:
        warn("Someting strange is going on. Make sure no tissue chunks are missing")
        # placeholder so the code below can run; it is not part of ``rois``
        roi_ = {'vertices': []}
    if reg is not None:
        mask_ = get_threshold_tissue_mask(reg, color=True,
                                filtersize=filtersize,
                                dtype=bool,
                                open=open, close=close,
                                lower = lower, upper = upper)
        if mask_.sum()==0:
            roi_["vertices"]= []
            # .get: the placeholder dict used when no tissue ROI was found
            # carries neither 'name' nor 'id'
            print("skipping empty mask", roi_.get('name'), roi_.get('id'))
        verts = convert_mask2contour(mask_.astype('uint8'), minlen=minlen)
        if len(verts)>0:
            # Keep the longest contour.  (np.argmax over a ``map`` object does
            # not work in Python 3: the iterator is not expanded into the
            # per-contour lengths.)  ``max`` returns the first of equal-length
            # contours, matching argmax tie-breaking.
            roi_["vertices"] = max(verts, key=len)
        mask_ = np.asarray(mask_, order='F')
    else:
        # NOTE(review): this branch dereferences ``reg.shape`` although ``reg``
        # is None here, so it cannot succeed -- confirm whether ``reg=None``
        # is meant to be supported and where the mask size should come from.
        mask_ = convert_contour2mask(roi_["vertices"], reg.shape[1], reg.shape[0],
                             fill=1, order='F')
        if mask_.sum()==0:
            roi_["vertices"]= []

    if isinstance(other_mask_, np.ndarray):
        # carve the annotated regions out of the tissue mask
        mask_ = mask_.astype(bool) & ~other_mask_.astype(bool)
    cocomask = encode(np.asarray(mask_, dtype='uint8'))
    cocomask["counts"] = cocomask["counts"].decode('utf-8')
    roi_.update(cocomask)
    if isinstance(roi_["vertices"], np.ndarray):
        roi_["vertices"] = roi_["vertices"].tolist()
    rois = [rr for rr in rois if len(rr['vertices'])>0]
    return rois


# Command-line entry point: converts the XML annotation of a slide to JSON,
# optionally saves patches targeted at the annotated ROIs, and then saves
# grid-sampled tissue patches (PNG + ROI JSON) under the output directory.
if __name__ == '__main__':
    import sys
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
      '--data-root',
      type=str,
      default='../data',
      help='The directory where the input data will be stored.')

    parser.add_argument(
      '--json-dir',
      type=str,
      default='../data/roi-json',
      help='The directory where the roi JSON files will be stored.')

    parser.add_argument(
      '--keep-empty',
      action='store_true',
      default=False,
      help='keep empty tissue chunks (with no annotations within)')

    parser.add_argument(
      '--target-side',
      type=int,
      default=1024,
      help='Side length of the square patches to extract.')

    parser.add_argument(
      '--max-area',
      type=float,
      default=1e7,
      help='maximal area of a roi')

    # The slide path is derived from this option, so it cannot be omitted.
    parser.add_argument(
      '--fnxml',
      dest='fnxml',
      type=str,
      required=True,
      help='The XML files for ROI.')

    parser.add_argument(
      '--all-grid',
      action='store_true',
      default=False,
      help='store all grid patches (by defaut grid patches that overlap features will be removed)')

    parser.add_argument(
      '--target-sampling',
      action='store_true',
      default=False,
      help='additionally store patches targeted at the annotated ROIs')

    parser.add_argument(
      '--keep-levels',
      type=int,
      default=3,
      help='Passed to extract_rois_svs_xml as keeplevels.')

    parser.add_argument(
      '--magnlevel',
      type=int,
      default=0,
      help='The patch side read from the slide is target-side * 4**magnlevel.')

    parser.add_argument(
      '--frac-stride',
      type=int,
      default=1,
      help='Grid step is the patch side integer-divided by this value.')

    prms = parser.parse_args()
    VISUALIZE = False

    # tissue-thresholding settings shared by both sampling passes
    lower = [0, 0, 180]
    upper = [179, 10, 255]
    close=50
    open_=30
    filtersize = 20

    # the slide is expected next to its annotation: same name, .svs extension
    fnsvs = re.sub(r"\.xml$", ".svs", prms.fnxml)

    outdir = os.path.join(prms.data_root, "data_{}/fullsplit".format(prms.target_side))

    ## setup
    imgid = get_img_id(fnsvs)

    target_size = [prms.target_side, prms.target_side,]

    # ## Read XML ROI, convert, and save as JSON
    # ``not`` rather than ``~``: bitwise NOT of a bool gives -1 or -2, both
    # truthy, which made --keep-empty a no-op.
    fnjson = extract_rois_svs_xml(prms.fnxml, outdir=prms.json_dir,
                                  remove_empty = not prms.keep_empty,
                                  keeplevels=prms.keep_levels)

    with open(fnjson,'r') as fh:
        roilist = json.load(fh)

    print("ROI type counts")
    print(pd.Series([roi["name"] for roi in roilist]).value_counts())

    # read slide
    slide = openslide.OpenSlide(fnsvs)

    # load the thumbnail image
    img = np.asarray(slide.associated_images["thumbnail"])

    median_color = get_median_color(slide)
    ratio = get_thumbnail_magnification(slide)

    print("full scale slide dimensions: w={}, h={}".format(*slide.dimensions))

    if VISUALIZE:
        from matplotlib import pyplot as plt
        colordict = {'open glom': 'b',
                     'scler glom': 'm',
                     'infl':'r',
                     'tissue':'w',
                     'other tissue':'y',
                     'art':'olive',
                     'fold':'y'}

        #cell#

        plt.figure(figsize = (18,10))
        plt.imshow(img)
        for roi in roilist:
            # vertices come back from JSON as plain lists, which cannot be
            # divided directly
            plot_contour(np.asarray(roi["vertices"])/ratio, c=colordict[roi['name']])

        #cell#
        vert = roilist[19]["vertices"]
        target_size = [1024]*2
        x,y,w,h = cv2.boundingRect(np.asarray(vert).round().astype(int))
        # NOTE(review): get_region_mask is not imported anywhere in this file;
        # this debugging branch fails with NameError if VISUALIZE is enabled.
        mask, cropped_vertices = get_region_mask(vert, [x,y], (w,h), color=(255,))

        plt.imshow(mask)
        plot_contour(cropped_vertices, c='r')
        print(mask.max())

    #############################
    if prms.target_sampling:
        print("READING TARGETED ROIS", file=sys.stderr)

        imgroiiter = read_roi_patches_from_slide(slide, roilist,
                                target_size = target_size,
                                maxarea = prms.max_area,
                                nchannels=3,
                                allcomponents=True,
                               )

        print("READING AND SAVING SMALLER ROIS (GLOMERULI, INFLAMMATION LOCI ETC.)",
              file=sys.stderr) 

        for reg, rois,_, start_xy in imgroiiter:
            sumdict = summarize_rois_wi_patch(rois, bg_names = ["tissue"], frac_thr=16)
            prefix = get_prefix(imgid, start_xy, sumdict["name"], sumdict["tissue_id"],
                                sumdict["id"], parentdir=outdir, suffix='-targeted')
            fn_json = prefix + ".json"
            fnoutpng = prefix + '.png'
            print(fnoutpng)
            os.makedirs(os.path.dirname(fn_json), exist_ok=True)

            if isinstance(reg, Image.Image):
                reg.save(fnoutpng)
            else:
                Image.fromarray(reg).save(fnoutpng)

            rois = add_roi_bytes(rois, reg, lower=lower, upper=upper,
                                 close=close,
                                 open=open_,
                                 filtersize = filtersize)
            with open(fn_json, 'w+') as fhj: json.dump( rois, fhj)

    print("READING AND SAVING _FEATURELESS_ / NORMAL TISSUE", file=sys.stderr)

    magnification = 4**prms.magnlevel
    real_side = prms.target_side * magnification

    # --max-area is honoured here as well (it used to be hard-coded to 1e7,
    # which is also the option's default).
    for tissue_chunk_iter in get_tissue_rois(slide,
                                            roilist,
                                            vis = False,
                                            step = real_side // prms.frac_stride,
                                            target_size = [real_side]*2,
                                            maxarea = prms.max_area,
                                            random=False,
                                            normal_only = not prms.all_grid,
                                           ):
        # save
        print('saving tissue chunk')
        save_tissue_chunks(tissue_chunk_iter, imgid, parentdir=outdir,
                           close=close,
                           open_=open_,
                           frac_thr=16,
                           filtersize = filtersize)