BioDiscML / Git / [ce076b] /src/biodiscml/AdaptDatasetToTraining.java

Models:
DanielG/
BioDiscML
Downloads: 1
[ce076b]: / src / biodiscml / AdaptDatasetToTraining.java
History
Download this file
335 lines (306 with data), 13.0 kB

/*
 * Merge the input tables (e.g. clinical data and gene expression) into a
 * single dataset file and perform some basic feature filtering.
 */
package biodiscml;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeMap;
import utils.Weka_module;
import utils.utils.TableObject;
import static utils.utils.*;

/**
 * Builds the tab-separated dataset file(s) used for training from the input
 * tables configured in {@link Main}: the tables are merged on their common
 * instance ids, features holding a single value are dropped, and the class
 * column is appended as the last column.
 *
 * @author Mickael
 */
public class AdaptDatasetToTraining {

    public static boolean debug = Main.debug;
    /**
     * Names of the features dropped while writing a training file because all
     * of their values were identical. Files written afterwards with
     * {@code trainingFile == false} drop the same features.
     */
    public static HashMap<String, String> removedFeatures = new HashMap<>();

    public AdaptDatasetToTraining() {
    }

    /**
     * create dataset with a determined class
     *
     * @param trainFile path of the training file to create; the names of the
     * "all_data", "data_to_test" and ".arff" companion files are derived from
     * it by string replacement
     */
    public AdaptDatasetToTraining(String trainFile) {
        String trainAndTestFile = trainFile.replace("data_to_train", "all_data");

        //remove files of previous run if exist
        if (!Main.restoreRun || !Main.resumeTraining) {
            System.out.println("Check if files exist and should be deleted...");
            if (new File(trainAndTestFile).exists()) {
                System.out.println("\t" + trainAndTestFile + " exist... deleting...");
                new File(trainAndTestFile).delete();
            }
            if (new File(trainFile).exists()) {
                System.out.println("\t" + trainFile + " exist... deleting...");
                new File(trainFile).delete();
                new File(trainFile.replace(".csv", ".arff")).delete();
            }
        }

        String className = Main.doClassification ? Main.classificationClassName : Main.regressionClassName;

        //create the adapted training file
        System.out.println("# Training file(s)");
        createFileCompatibleForWeka(className, Main.hmTrainFiles, trainFile, Main.separator, true);

        //create the adapted tested file
        if (Main.doSampling) {
            System.out.println("## Apply sampling configuration");
            String testFile = trainFile.replace("data_to_train.csv", "data_to_test.csv");
            Weka_module weka = new Weka_module();
            String trainSetRange = "";
            if (!Main.hmNewDataFiles.isEmpty()) {
                //a test file is provided
                System.out.println("# Testing file(s)");
                createFileCompatibleForWeka(className, Main.hmNewDataFiles, testFile, Main.separator, false);
                //if a test file is provided, we need to merge it to the train file and
                // split it again to preserve a compatible arff format between train and test sets
                trainSetRange = mergeTrainAndTestFiles(trainFile, testFile, trainAndTestFile);
            } else {
                // no test file: just rename the train file to a file that contains all train and test data
                if (!new File(trainFile).renameTo(new File(trainAndTestFile))) {
                    System.err.println("[warning] Could not rename " + trainFile + " to " + trainAndTestFile);
                }
            }

            //perform sampling
            weka.sampling(trainAndTestFile, trainFile, testFile, Main.isClassification, trainSetRange);
        }
        System.out.println("");
    }

    /**
     * get common ids between infiles
     *
     * @param al_tables tables to compare
     * @return map whose keys are the lower-cased ids present in every table
     * (values are empty strings); empty when no table is given
     */
    public HashMap<String, String> getCommonIds(ArrayList<TableObject> al_tables) {
        HashMap<String, String> hm_ids = new HashMap<>();
        if (al_tables == null || al_tables.isEmpty()) {
            return hm_ids;
        }

        //count in how many tables each (lower-cased) id is seen
        HashMap<String, Integer> hm_counts = new HashMap<>();
        for (TableObject table : al_tables) {
            // ids already counted for this table, so that two ids differing
            // only by case inside one table are counted once
            HashMap<String, String> seenInTable = new HashMap<>();
            for (String s : table.hmIDsList.keySet()) {
                String key = s.toLowerCase();
                if (seenInTable.put(key, key) == null) {
                    Integer count = hm_counts.get(key);
                    hm_counts.put(key, count == null ? 1 : count + 1);
                }
            }
        }

        //keep ids seen in every table
        for (String s : hm_counts.keySet()) {
            if (hm_counts.get(s) == al_tables.size()) {
                hm_ids.put(s, "");
            }
        }
        return hm_ids;
    }

    /**
     * Merges the input tables into one tab-separated file: a header line
     * (merging id, feature names, "class"), then one line per instance id
     * common to all tables, ordered by the ids of the first table.
     *
     * @param theClass name of the class column to extract
     * @param infiles input file path -> prefix added to the feature names of
     * that file (no prefix when empty)
     * @param outfile path of the file to write
     * @param separator column separator of the input files
     * @param trainingFile true to detect and drop single-valued features
     * (recorded in {@link #removedFeatures}); false to drop the features
     * already recorded there
     */
    private void createFileCompatibleForWeka(String theClass, HashMap<String, String> infiles, String outfile, String separator, Boolean trainingFile) {
        //convert hashmap to parallel arrays
        String[] files = new String[infiles.size()];
        String[] prefixes = new String[infiles.size()];
        int cpt = 0;
        for (String f : infiles.keySet()) {
            files[cpt] = f;
            prefixes[cpt] = infiles.get(f);
            cpt++;
        }

        //load datasets of features
        if (debug) {
            System.out.println("loading files");
        }

        ArrayList<TableObject> al_tables = new ArrayList<>();
        int classIndex = -1;

        for (int i = 0; i < files.length; i++) {
            String file = files[i];
            if (debug) {
                System.out.println(file);
            }

            TableObject tbo = new TableObject(readTable(file, separator));
            //locate class
            if (tbo.containsClass(theClass)) {
                classIndex = i;
            }
            al_tables.add(tbo);
        }

        //extract class; classIndex is still -1 when no table holds it, which
        //makes the lookup below fail and end the program
        ArrayList<String> myClass = new ArrayList<>();
        try {
            myClass = al_tables.get(classIndex).getTheClass(theClass);
        } catch (Exception e) {
            System.err.println("[error] Class " + theClass + " not found. Error in the input file.");
            if (Main.debug) {
                e.printStackTrace();
            }
            System.exit(0);
        }

        if (trainingFile) {
            removeConstantFeatures(al_tables);
        } else {
            removePreviouslyRemovedFeatures(al_tables);
        }

        //create outfile
        if (debug) {
            System.out.println("create outfile " + outfile);
        }
        // try-with-resources: the writer is closed even if writing fails
        try (PrintWriter pw = new PrintWriter(new FileWriter(outfile))) {
            ///////// PRINT HEADER
            int featuresCpt = 0;
            pw.print(Main.mergingID);
            cpt = 0;
            for (TableObject tbo : al_tables) {
                for (String s : tbo.getSortedHmDataKeyset()) {
                    if (!Main.hmExcludedFeatures.containsKey(s)) {
                        if (!prefixes[cpt].isEmpty()) {
                            pw.print("\t" + prefixes[cpt] + "__" + s);
                        } else {
                            pw.print("\t" + s);
                        }
                        featuresCpt++;
                    }
                }
                cpt++;
            }
            pw.println("\tclass");
            pw.flush();

            //search for ids present in all datasets
            HashMap<String, String> hm_ids = getCommonIds(al_tables);
            if (debug && al_tables.size() > 1) {
                System.out.println("Total number of common instances between files: " + hm_ids.size());
            } else {
                System.out.println("Total number of instances between files: " + hm_ids.size());
            }

            System.out.println("Total number of features: " + featuresCpt);

            // common ids are matched case-insensitively, so each table needs a
            // case-insensitive way to find the row of an id
            ArrayList<HashMap<String, Integer>> lowerCaseIndexes = new ArrayList<>();
            for (TableObject tbo : al_tables) {
                lowerCaseIndexes.add(lowerCaseIdIndex(tbo));
            }

            ///////PRINT CONTENT
            TreeMap<String, Integer> tm = new TreeMap<>();
            tm.putAll(al_tables.get(0).hmIDsList);
            int existing_spaces = 0;
            for (String id : tm.keySet()) {
                // skip ids missing from a table, and the merging-id header itself
                if (!hm_ids.containsKey(id.toLowerCase()) || id.equalsIgnoreCase(Main.mergingID)) {
                    continue;
                }
                pw.print(id);
                for (int t = 0; t < al_tables.size(); t++) {
                    TableObject tbo = al_tables.get(t);
                    int idIndex = rowIndexOf(tbo, lowerCaseIndexes.get(t), id);
                    for (String s : tbo.getSortedHmDataKeyset()) {
                        if (!Main.hmExcludedFeatures.containsKey(s)) { //if it is not a rejected feature
                            pw.print("\t" + normalizeValue(tbo.hmData.get(s).get(idIndex)));
                        }
                    }
                }
                String classe = myClass.get(rowIndexOf(al_tables.get(classIndex), lowerCaseIndexes.get(classIndex), id));
                if (classe.contains(" ")) {
                    existing_spaces++;
                }
                classe = classe.replace(" ", "_");
                pw.print("\t" + classe);
                pw.println();
            }
            if (existing_spaces > 0) {
                System.out.println("Spaces detected in class label. They were replaced by _");
            }
            pw.flush();

            if (debug) {
                System.out.println("closing outfile " + outfile);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Drops from every table the features whose values are all identical and
     * records their names in {@link #removedFeatures}.
     */
    private void removeConstantFeatures(ArrayList<TableObject> al_tables) {
        try {
            for (TableObject tbo : al_tables) {
                for (String s : tbo.getSortedHmDataKeyset()) {
                    HashMap<String, String> hm = new HashMap<>();
                    for (String value : tbo.hmData.get(s)) {
                        hm.put(value, value);
                        if (hm.size() > 1) {
                            break; // a second distinct value is enough to keep the feature
                        }
                    }
                    if (hm.size() == 1) {
                        String onlyValue = hm.keySet().iterator().next();
                        tbo.hmData.remove(s);
                        if ("?".equals(onlyValue)) {
                            System.out.println("Removing feature " + s + " "
                                    + "because 100% of values are missing");
                        } else {
                            System.out.println("Removing feature " + s + " "
                                    + "because 100% of values are identical "
                                    + "{" + onlyValue + "}");
                        }
                        removedFeatures.put(s, s);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Drops from every table the features listed in {@link #removedFeatures}.
     */
    private void removePreviouslyRemovedFeatures(ArrayList<TableObject> al_tables) {
        try {
            for (TableObject tbo : al_tables) {
                for (String s : tbo.getSortedHmDataKeyset()) {
                    if (removedFeatures.containsKey(s)) {
                        tbo.hmData.remove(s);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a lower-cased id -> row index map for a table.
     */
    private HashMap<String, Integer> lowerCaseIdIndex(TableObject tbo) {
        HashMap<String, Integer> index = new HashMap<>();
        for (String id : tbo.hmIDsList.keySet()) {
            index.put(id.toLowerCase(), tbo.hmIDsList.get(id));
        }
        return index;
    }

    /**
     * Returns the row index of an id in a table, preferring an exact match and
     * falling back to a case-insensitive one.
     */
    private int rowIndexOf(TableObject tbo, HashMap<String, Integer> lowerCaseIndex, String id) {
        Integer exact = tbo.hmIDsList.get(id);
        if (exact != null) {
            return exact;
        }
        return lowerCaseIndex.get(id.toLowerCase());
    }

    /**
     * Converts a raw cell value for output: decimal commas become dots, the
     * value is trimmed, and empty/NA markers become "?".
     */
    private String normalizeValue(String raw) {
        String out = raw.replace(",", ".").trim();
        if (out.isEmpty() || out.equals("NA") || out.equals("na")
                || out.equals("N/A") || out.equals("n/a")) {
            return "?";
        }
        return out;
    }

    /**
     * Writes the train file followed by the test file (without its header
     * line) into a single file.
     *
     * @param trainFile file copied first, header included
     * @param testFile file appended after it, first line skipped
     * @param trainAndTestFile file to create
     * @return range of the train set (ex: 1-100), where the upper bound is the
     * number of train lines copied minus the header
     */
    private String mergeTrainAndTestFiles(String trainFile, String testFile, String trainAndTestFile) {
        int cpt = -1; // -1 so that the header line is not counted
        try (PrintWriter pw = new PrintWriter(new FileWriter(trainAndTestFile))) {
            //read train
            try (BufferedReader br = new BufferedReader(new FileReader(trainFile))) {
                String line;
                while ((line = br.readLine()) != null) {
                    pw.println(line);
                    cpt++;
                }
            }
            pw.flush();

            //read test
            try (BufferedReader br = new BufferedReader(new FileReader(testFile))) {
                String line = br.readLine(); // skip header
                while ((line = br.readLine()) != null) {
                    pw.println(line);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "1-" + cpt;
    }

}