|
a |
|
b/Semantic Features/GetRadData.m |
|
|
1 |
function [Xraw, Yraw, instanceID, ratingRow, data, histos ] = GetRadData(minAgreement, dataFile)
%GetRadData Build feature (X) and rating (Y) matrices from the radiologist spreadsheet.
%   [Xraw, Yraw, instanceID, ratingRow, data, histos] = GetRadData(minAgreement)
%   reads the spreadsheet, groups consecutive rows that share the same value
%   in column 6, and keeps only the groups that contain at least
%   minAgreement rows (i.e. at least that many radiologist ratings).
%
%   GetRadData(minAgreement, dataFile) reads dataFile instead of the
%   built-in default path.
%
%   Inputs
%     minAgreement - numeric scalar >= 1; smallest group size to keep.
%                    Values above 4 behave like 4 (only the 4-rating
%                    groups are returned).
%     dataFile     - (optional) path of the Excel file to read.
%
%   Outputs
%     Xraw       - numeric matrix built from every column to the right of
%                  the 't1.coords' header, one row per kept spreadsheet row.
%     Yraw       - numeric matrix of the rating columns (9, 11..16).
%     instanceID - column 1 of the kept rows, used to match rows against
%                  features extracted in a different order.
%     ratingRow  - 1x4 vector; ratingRow(k) is the index in the sorted data
%                  at which the block of k-rating rows starts. It is
%                  computed from all groups regardless of minAgreement.
%     data       - kept rows as a cell array, ordered 4-rating groups
%                  first, then 3, ... down to minAgreement.
%     histos     - numCategories x 5 matrix; row i is the normalised
%                  distribution of Yraw(:,i) as produced by Y2YVectors.

    MAX_RATERS      = 4;                      % largest supported group size
    ID_COLUMN       = 6;                      % column whose value identifies a group
    INSTANCE_COLUMN = 1;                      % column holding the instance ID
    NUM_LEVELS      = 5;                      % number of rating levels passed to Y2YVectors
    ratingColumn    = [9,11,12,13,14,15,16];  % columns where the ratings we want are found
    numCategories   = length(ratingColumn);

    if nargin < 1 || ~isnumeric(minAgreement) || ~isscalar(minAgreement) || ~(minAgreement >= 1)
        error('GetRadData:badMinAgreement', ...
              'minAgreement must be a numeric scalar greater than or equal to 1.');
    end
    if nargin < 2 || isempty(dataFile)
        dataFile = 'C:\Users\esmith2\Documents\Ethans Code\gitCode\Semantic Features\LIDC_All_Radiologist_Cases_For_Largest_Slice cleaned modified ES recalc Features by PS.xlsx';
    end

    % Load data
    fprintf('Reading excel file\n'); % This read is the slow part
    [~, ~, rawData] = xlsread(dataFile);

    % Locate the first feature column: it is the one right after the
    % 't1.coords' header. The last match wins, as in the original loop.
    coordsColumn = find(strcmp(rawData(1,:), 't1.coords'), 1, 'last');
    if isempty(coordsColumn)
        error('GetRadData:missingHeader', ...
              'Header ''t1.coords'' was not found in the first row of the spreadsheet.');
    end
    featureColumn = coordsColumn + 1;

    rawData = rawData(2:end, :); % Remove headers
    if isempty(rawData)
        error('GetRadData:noData', 'The spreadsheet contains no data rows.');
    end

    % Sort rows by the number of ratings in their group
    fprintf('Only looking at data with at least %d ratings\n' , minAgreement);
    goodRows = groupRowsByRatingCount(rawData(:, ID_COLUMN), MAX_RATERS);

    % Form sorted data: the 4-rating groups always come first, followed by
    % the smaller groups down to minAgreement.
    groupOrder = [MAX_RATERS, (MAX_RATERS-1):-1:minAgreement];
    keptRows = vertcat(goodRows{groupOrder});
    data = rawData(keptRows, :);

    % Index where each group starts in the sorted data (all the 4s, all
    % the 3s, etc.)
    ratingRow = zeros(1, MAX_RATERS);
    ratingRow(1, MAX_RATERS) = 1;
    for k = (MAX_RATERS-1):-1:1
        ratingRow(1,k) = length(goodRows{k+1}) + ratingRow(1,k+1);
    end

    % Instance IDs for each row so we can match up with features extracted
    % in a different order
    instanceID = cell2mat(data(:, INSTANCE_COLUMN));

    fprintf('Selecting all data\n');
    % Straight one-to-one values
    Xraw = cell2mat(data(:, featureColumn:end));
    Yraw = cell2mat(data(:, ratingColumn));

    % Histograms, as a check against broken clock/noise
    histos = zeros(numCategories, NUM_LEVELS);
    for k = 1:numCategories
        YVectors = Y2YVectors(Yraw(:,k), NUM_LEVELS);
        % Sum explicitly down the rows so a single-row input still yields
        % one count per rating level instead of a scalar.
        histos(k,:) = sum(YVectors, 1) / sum(YVectors(:));
    end
end

function goodRows = groupRowsByRatingCount(ids, maxRaters)
%groupRowsByRatingCount Bucket row indices by the size of their run of equal IDs.
%   ids is an N-by-1 cell array. Consecutive rows whose IDs compare equal
%   under strcmp form one group. goodRows{k} is a column vector with the
%   row indices of every group that contains exactly k rows.
%   NOTE(review): strcmp only matches character IDs; numeric IDs would
%   never compare equal, so every row would form its own group - confirm
%   that column 6 holds text.

    goodRows = cell(1, maxRaters);
    numRows = size(ids, 1);
    groupStart = 1;
    % Iterate one past the end so the final group is closed by the same
    % code path as every other group.
    for r = 2:(numRows + 1)
        if r <= numRows && strcmp(ids(groupStart), ids(r))
            continue; % still inside the current group
        end
        groupSize = r - groupStart;
        if groupSize > maxRaters
            error('GetRadData:tooManyRatings', ...
                  'Rows %d-%d share one ID (%d rows), but at most %d ratings per ID are supported.', ...
                  groupStart, r - 1, groupSize, maxRaters);
        end
        goodRows{groupSize} = vertcat(goodRows{groupSize}, (groupStart:(r-1))');
        groupStart = r;
    end
end
|
|
92 |
|