ComputerVision-LungCancer / Git / Diff of /Semantic Features/GetRadData.m

Models:
SCallahan/
ComputerVision-LungCancer
Downloads: 1
Diff of /Semantic Features/GetRadData.m [000000] .. [b4b313]
Switch to side-by-side view

--- a
+++ b/Semantic Features/GetRadData.m
@@ -0,0 +1,92 @@
+function [Xraw, Yraw, instanceID, ratingRow, data, histos ] = GetRadData(minAgreement)
+%getRadData Generates the various X, Y matrices based on the radiologists
+%   Can be tweaked based on the minimum number of radiologists giving a
+%   rating for the data to be kept. Gets the feature data and makes an X
+%   matrix, gets the rating data and makes a Y matrix. Also gives you a
+%   histogram distribution of the ratings. 
+
+%Load data
+fprintf('Reading excel file\n'); %This read is the slow part
+[~, ~, rawData] = xlsread('C:\Users\esmith2\Documents\Ethans Code\gitCode\Semantic Features\LIDC_All_Radiologist_Cases_For_Largest_Slice cleaned modified ES recalc Features by PS.xlsx');
+
+ratingColumn = [9,11,12,13,14,15,16]; %columns where the ratings we want are found
+numCategories = length(ratingColumn);
+
+%Get colums of special headers
+for i = 1:size(rawData,2)
+    if strcmp(rawData{1,i}, 't1.coords') == 1
+        featureColumn = i + 1;
+    end
+end
+numFeatures = size(rawData,2) - (featureColumn - 1);
+
+%[~, ~, rawData] = xlsread('C:\Ethan\Dropbox\MedIX\Lung Segmentation\Repo\Semantic Features\cleaned data in order with features.xlsx');
+rawData = rawData(2:end, :); %Remove headers 
+
+%Sort numRatings
+fprintf('Only looking at data with at least %d ratings\n' , minAgreement);
+currentID = rawData(1,6);
+dupCount = 1;
+goodRows = cell(1,4);
+rowStart = 0;
+rowStop = 0;
+for i = 2:size(rawData,1)
+    %fprintf('%d %d %d %d\n', i, strcmp(currentID, rawData(i,6)), dupCount, minAgreement);
+    if strcmp(currentID, rawData(i,6)) == 1 %Still part of existing line
+        dupCount = dupCount + 1;
+    else %Found new set, process it
+        rowStart = rowStop + 1;
+        rowStop = i-1;
+        goodRows{dupCount} = vertcat(goodRows{dupCount}, [rowStart:rowStop]');
+
+        currentID = rawData(i,6);%start new count
+        dupCount = 1;
+    end
+end
+%process last set
+rowStart = rowStop + 1;
+rowStop = i;
+goodRows{dupCount} = vertcat(goodRows{dupCount}, [rowStart:rowStop]');
+
+%form sorted data   
+data = rawData(goodRows{4},:);
+for i = 3:-1:minAgreement
+    data = vertcat(data, rawData(goodRows{i},:) );
+end
+%fprintf('4 %d 3 %d 2 %d 1 %d Total %d\n', length(goodRows{4}), length(goodRows{3}), length(goodRows{2}), length(goodRows{1}), length(goodRows{4}) + length(goodRows{3}) + length(goodRows{2}) + length(goodRows{1}));
+
+%Get indexes for where each group starts and stops (all the 4s, all the
+%3s etc..
+ratingRow = zeros(1,4);
+ratingRowGroup = zeros(1,4);
+ratingRow(1,4) = 1;
+ratingRowGroup(1,4) = 1;
+for i = 3:-1:1
+    ratingRow(1,i)      = length(goodRows{i+1}) + ratingRow(1,i+1);
+    ratingRowGroup(1,i) = length(goodRows{i+1}) / (i+1) + ratingRowGroup(1,i+1);
+end
+%Get instance IDs for each row so we can match up with features
+%extracted in a different order
+instanceID = cell2mat(data(:,1));
+
+fprintf('Selecting all data\n'); 
+%Get strait one to one values
+Xraw = cell2mat(data(:,featureColumn:featureColumn+numFeatures-1));
+Yraw = cell2mat(data(:,ratingColumn));
+
+%Make histograms for check against broken clock/noise
+%tabulate(Ytest(:,1)) function is better :-(
+histos = zeros(numCategories, 5);
+for i = 1:numCategories
+    YVectors = Y2YVectors(Yraw(:,i), 5);
+    histos(i,:) = sum(YVectors)/ sum(sum(YVectors));
+end
+%histos = [histos, zeros(numCategories, 1)];
+%histos = horzcat(histos, max(histos,[], 2));
+
+clear localDirectoryName serverDirectoryName lidcDirectoryName directoryName dataFileFullName YVectors ...
+    categories currentID dupCount featureColumn featureDirectory i message numCategories 
+    
+%clear rawData goodRows
+end
+