Switch to unified view

a b/Semantic Features/GetRadData.m
1
function [Xraw, Yraw, instanceID, ratingRow, data, histos ] = GetRadData(minAgreement, dataFile)
%GETRADDATA Build the X (feature) and Y (rating) matrices from the radiologist spreadsheet.
%   [Xraw, Yraw, instanceID, ratingRow, data, histos] = GetRadData(minAgreement)
%   reads the spreadsheet, groups consecutive rows that share the same value
%   in column 6, and keeps only the groups made of at least minAgreement
%   rows (i.e. rated by at least that many radiologists).
%
%   [...] = GetRadData(minAgreement, dataFile) reads dataFile instead of the
%   default workbook path.
%
%   Inputs
%     minAgreement - minimum number of rows per group for the group to be
%                    kept. Optional, defaults to 1. Values below 1 are
%                    treated as 1.
%     dataFile     - path of the Excel workbook. Optional.
%
%   Outputs
%     Xraw       - numeric matrix of every column after the 't1.coords'
%                  header, one row per kept spreadsheet row.
%     Yraw       - numeric matrix of the rating columns (see ratingColumn).
%     instanceID - column 1 of the kept rows, used to match rows with
%                  features extracted in a different order.
%     ratingRow  - 1x4 vector; ratingRow(k) is the row of data where the
%                  groups of size k start when every group size is kept.
%                  It is computed independently of minAgreement, so entries
%                  for sizes that were filtered out point past the end of
%                  data.
%     data       - kept rows as a cell array, ordered by group size: all
%                  groups of 4 first, then 3, and so on.
%     histos     - one row per rating column, 5 columns: normalised
%                  distribution of the ratings (all zeros when no row is
%                  kept).

maxRaters = 4;   % largest group size the rest of the function supports
numLevels = 5;   % number of rating levels passed to Y2YVectors

if nargin < 1 || isempty(minAgreement)
    minAgreement = 1;
end
if ~isnumeric(minAgreement) || ~isscalar(minAgreement)
    error('GetRadData:badMinAgreement', 'minAgreement must be a numeric scalar.');
end
if nargin < 2 || isempty(dataFile)
    dataFile = 'C:\Users\esmith2\Documents\Ethans Code\gitCode\Semantic Features\LIDC_All_Radiologist_Cases_For_Largest_Slice cleaned modified ES recalc Features by PS.xlsx';
end

%Load data
fprintf('Reading excel file\n'); %This read is the slow part
[~, ~, rawData] = xlsread(dataFile);

ratingColumn = [9,11,12,13,14,15,16]; %columns where the ratings we want are found
numCategories = length(ratingColumn);

%Features start in the column right after the 't1.coords' header. If the
%header appears more than once the last occurrence wins.
headerHit = find(strcmp(rawData(1,:), 't1.coords'), 1, 'last');
if isempty(headerHit)
    error('GetRadData:missingHeader', ...
        'Header ''t1.coords'' not found in the first row of %s.', dataFile);
end
featureColumn = headerHit + 1;

rawData = rawData(2:end, :); %Remove headers
numRows = size(rawData, 1);
if numRows == 0
    error('GetRadData:noData', 'No data rows found in %s.', dataFile);
end

%Sort numRatings: goodRows{k} collects the row indices of every run of
%exactly k consecutive rows sharing the same column-6 value.
fprintf('Only looking at data with at least %d ratings\n' , minAgreement);
goodRows = cell(1, maxRaters);
groupStart = 1;
for r = 2:numRows + 1
    %r == numRows + 1 is a sentinel step that flushes the final group, so
    %the last group is handled even when there is only one data row.
    if r <= numRows && strcmp(rawData(groupStart,6), rawData(r,6))
        continue %Still part of the current group
    end
    groupSize = r - groupStart;
    if groupSize > maxRaters
        error('GetRadData:tooManyRatings', ...
            'Rows %d-%d form a group of %d ratings; at most %d are supported.', ...
            groupStart, r - 1, groupSize, maxRaters);
    end
    goodRows{groupSize} = vertcat(goodRows{groupSize}, (groupStart:r-1)');
    groupStart = r;
end

%form sorted data: largest groups first, down to the requested minimum
lowestGroup = max(1, minAgreement); %guards against indexing goodRows{0}
data = rawData(goodRows{maxRaters},:);
for k = maxRaters-1:-1:lowestGroup
    data = vertcat(data, rawData(goodRows{k},:) ); %#ok<AGROW>
end

%Get indexes for where each group starts (all the 4s, all the 3s etc..)
ratingRow = zeros(1, maxRaters);
ratingRow(1, maxRaters) = 1;
for k = maxRaters-1:-1:1
    ratingRow(1,k) = numel(goodRows{k+1}) + ratingRow(1,k+1);
end

%Get instance IDs for each row so we can match up with features
%extracted in a different order
instanceID = cell2mat(data(:,1));

fprintf('Selecting all data\n');
%Get straight one to one values
Xraw = cell2mat(data(:,featureColumn:end));
Yraw = cell2mat(data(:,ratingColumn));

%Make histograms for check against broken clock/noise
%tabulate(Ytest(:,1)) function is better :-(
histos = zeros(numCategories, numLevels);
if ~isempty(Yraw)
    for c = 1:numCategories
        %NOTE(review): assumes Y2YVectors returns one row per rating and
        %numLevels columns - confirm against its definition.
        YVectors = Y2YVectors(Yraw(:,c), numLevels);
        %Sum along dimension 1 explicitly so a single kept row still gives
        %a 1 x numLevels histogram instead of a scalar.
        histos(c,:) = sum(YVectors, 1) / sum(YVectors(:));
    end
end
end
92