MWAS-Biomarkers / Git / [793d90] /lib/gcforest/fgnet.py

Models:
DanielG/
MWAS-Biomarkers
Downloads: 1
[793d90]: / lib / gcforest / fgnet.py
History
Download this file
140 lines (122 with data), 6.6 kB

# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of the gcForest library as well as some demo client scripts to demonstrate how to use the code. The implementation is flexible enough for modifying the model or
fitting your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the dependencies are installed, which are specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np
import os, os.path as osp
import json

from .layers import get_layer
from .utils.log_utils import get_logger

# Module-level logger used by every method of FGNet below; obtained from the
# project's get_logger helper under the name "gcforest.gcnet".
LOGGER = get_logger("gcforest.gcnet")

class FGNet(object):
    """
    GCForest : FineGrained Components

    An ordered collection of layers built from ``net_config["layers"]``.
    Every layer is created with the same ``data_cache`` object, and the
    net-level methods simply run the corresponding layer method on each
    layer in configuration order.
    """

    def __init__(self, net_config, data_cache):
        """
        Validate ``net_config`` and build one layer per configured entry.

        net_config (dict):
            Keys read here: "layers" (required), "inputs", "outputs" and
            "model_cache" -> "disk_base" (all optional).
        data_cache:
            Object handed to every layer; this class also calls its
            ``reset``, ``get``, ``gets`` methods and reads ``cache_dir``.

        Raises ValueError when check_net_config rejects the configuration.
        """
        self.data_cache = data_cache
        # check_net_config reads self.inputs, so it has to be set first.
        self.inputs = net_config.get("inputs", [])
        self.check_net_config(net_config)
        self.outputs = net_config.get("outputs", [])

        # layers
        self.layers = []
        self.name2layer = {}
        model_disk_base = net_config.get("model_cache", {}).get("disk_base", None)
        for layer_config in net_config["layers"]:
            layer = get_layer(layer_config, self.data_cache)
            layer.model_disk_base = model_disk_base
            self.layers.append(layer)
            self.name2layer[layer.name] = layer

    def fit_transform(self, X_train, y_train, X_test, y_test, train_config):
        """
        Store the data in the data cache, then call fit_transform on every
        layer in order.

        X_train, y_train:
            Training data; both must expose ``.shape`` (it is logged).
        X_test, y_test:
            Test data, may be None. They are only stored in the data cache
            when "test" is listed in ``train_config.phases``.
        train_config:
            Must expose ``phases``; it is passed unchanged to each layer.
        """
        LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
            X_train.shape, y_train.shape, None if X_test is None else X_test.shape, None if y_test is None else y_test.shape))
        self.data_cache.reset("train", X_train, y_train)
        if "test" in train_config.phases:
            self.data_cache.reset("test", X_test, y_test)
        for layer in self.layers:
            layer.fit_transform(train_config)

    @staticmethod
    def concat_datas(datas):
        """
        Flatten each array to 2-D and concatenate them along axis 1.

        datas:
            Either a list of arrays sharing the same first dimension, or any
            other object, which is returned unchanged.

        Returns the concatenated 2-D array (or ``datas`` itself when it is
        not a list). The caller's list and its arrays are left untouched.
        """
        if not isinstance(datas, list):
            return datas
        # Build a new list instead of overwriting entries of the caller's list.
        flattened = [data.reshape((data.shape[0], -1)) for data in datas]
        return np.concatenate(flattened, axis=1)

    def transform(self, X_test):
        """
        Store ``X_test`` as the "test" data (without labels), run transform
        on every layer in order and return the configured "test" outputs.
        """
        LOGGER.info("X_test.shape={}".format(X_test.shape))
        self.data_cache.reset("test", X_test, None)
        for layer in self.layers:
            layer.transform()
        return self.get_outputs("test")

    def score(self):
        """Call score() on every layer in order."""
        for layer in self.layers:
            layer.score()

    def get_outputs(self, phase):
        """Return what the data cache holds for the configured outputs in ``phase``."""
        return self.data_cache.gets(phase, self.outputs)

    def save_outputs(self, phase, save_y=True, save_path=None):
        """
        Pickle the configured outputs of ``phase`` into a single file.

        phase (str):
            Phase whose data is read from the data cache.
        save_y (bool): default=True
            When True the entry named "y" is saved alongside the outputs.
        save_path (str): default=None
            Destination file. When None the file is
            ``<data_cache.cache_dir>/<phase>/outputs.pkl``; if cache_dir is
            also None an error is logged and nothing is saved.

        The saved object is a dict mapping data name -> data.
        """
        if save_path is None:
            if self.data_cache.cache_dir is None:
                LOGGER.error("save path is None and data_cache.cache_dir is None!!! don't know where to save")
                return
            save_path = osp.join(self.data_cache.cache_dir, phase, "outputs.pkl")
        import pickle
        data_names = list(self.outputs)
        if save_y:
            data_names.append("y")
        datas = {}
        shape_infos = []
        for data_name in data_names:
            datas[data_name] = self.data_cache.get(phase, data_name)
            shape_infos.append("{}->{}".format(data_name, datas[data_name].shape))
        LOGGER.info("outputs.shape={}".format(",".join(shape_infos)))
        LOGGER.info("Saving Outputs in {} ".format(save_path))
        # The per-phase directory may not exist yet; open() would fail without it.
        save_dir = osp.dirname(save_path)
        if save_dir and not osp.isdir(save_dir):
            os.makedirs(save_dir)
        with open(save_path, "wb") as f:
            pickle.dump(datas, f, pickle.HIGHEST_PROTOCOL)

    def check_net_config(self, net_config):
        """
        check net_config

        Reads ``self.inputs``, so that attribute must be set beforehand.

        Raises ValueError when:
            - two layers share the same name;
            - a bottom is neither "X", "y", a declared input, nor a top of an
              earlier layer;
            - two layers declare the same top;
            - an output is neither "X", "y", a declared input, nor a top.
        Only logs a warning when the outputs list is empty or when top names
        do not follow the layer-name convention.
        """
        top2layer = {}
        name2layer = {}
        for li, layer_config in enumerate(net_config["layers"]):
            layer_name = layer_config["name"]
            if layer_name in name2layer:
                raise ValueError("layer name duplicate. layer_name={}, config1={}, config2={}".format(
                    layer_name, name2layer[layer_name], layer_config))
            name2layer[layer_name] = layer_config

            # Bottoms are checked before this layer's own tops are registered,
            # so a layer can only consume data produced by earlier layers.
            for bottom in layer_config["bottoms"]:
                if bottom != "X" and bottom != "y" and bottom not in self.inputs and bottom not in top2layer:
                    raise ValueError("li={}, layer_config={}, bottom({}) doesn't be produced by other layers".format(
                        li, layer_config, bottom))
            for top in layer_config["tops"]:
                if top in top2layer:
                    raise ValueError("top duplicate. layer({}) and layer({}) have same top blob: {}".format(
                        top2layer[top], layer_config["name"], top))
                top2layer[top] = layer_config["name"]

        outputs = net_config.get("outputs", [])
        if not outputs:
            LOGGER.warning("outputs list is empty!!!")
        for output in outputs:
            if output == "X" or output == "y" or output in self.inputs or output in top2layer:
                continue
            raise ValueError("output data name not exist: output={}".format(output))

        # Naming convention: advisory only, never fatal.
        for layer_config in net_config["layers"]:
            tops = layer_config["tops"]
            if len(tops) > 1:
                for top_name in tops:
                    if not top_name.startswith(layer_config["name"]):
                        LOGGER.warning("top_name is suggested to startswith layer_name: layer_config={}".format(layer_config))
            elif len(tops) == 1:
                if tops[0] != layer_config["name"]:
                    LOGGER.warning("layer_name != top_name, You should check to make sure this is what you want!!! layer_config={}".format(layer_config))
            # A layer without tops has no top name to compare, so it is skipped.