function InforGain = gain(data)
% GAIN  Information gain of every attribute column with respect to the labels.
%
%   InforGain = gain(data)
%
%   Input
%     data      - m-by-n numeric matrix. Columns 1..n-1 are attributes and
%                 are treated as discrete (rows are grouped by exact value
%                 equality). Column n holds the class label of each row.
%
%   Output
%     InforGain - (n-1)-by-2 matrix. Row i is [i, g], where g is
%                 H(labels) - H(labels | column i), rounded to 4 decimals.
%
%   Notes
%     * The number of columns and the set of class labels are taken from
%       DATA itself; nothing is fixed to 42 columns or to labels 0/1/2.
%     * Terms with zero probability contribute 0 to the entropy, so a class
%       missing from a subset (or from the whole data set) never yields NaN.
%     * Rounding uses plain ROUND, so no Mapping Toolbox (roundn) is needed.
%     * Empty input, or input with fewer than two columns, returns a result
%       with all gains equal to 0 (and zero rows when there is no attribute).

[rowCount, colCount] = size(data);
attrCount = max(colCount - 1, 0);

% Pre-fill the index column; gains default to 0.
InforGain = [(1:attrCount)', zeros(attrCount, 1)];
if rowCount == 0 || attrCount == 0
    return;
end

labels  = data(:, colCount);
classes = unique(labels);

% Entropy of the whole label column.
totalEntropy = labelEntropy(labels, classes);

% Information gain of each attribute column.
for colIdx = 1:attrCount
    col    = data(:, colIdx);
    values = unique(col);

    % Conditional entropy: per-value entropy weighted by the share of rows
    % that take that value.
    condEntropy = 0;
    for v = 1:numel(values)
        mask   = (col == values(v));
        weight = sum(mask) / rowCount;
        condEntropy = condEntropy + weight * labelEntropy(labels(mask), classes);
    end

    % Round to 4 decimal places (same result as roundn(x, -4)).
    InforGain(colIdx, 2) = round((totalEntropy - condEntropy) * 1e4) / 1e4;
end
end

function h = labelEntropy(labels, classes)
% LABELENTROPY  Shannon entropy (base 2) of LABELS over the given CLASSES.
%   Returns 0 for an empty label vector. Classes with zero frequency are
%   skipped so that 0*log2(0) is treated as 0 instead of NaN.
h = 0;
total = numel(labels);
if total == 0
    return;
end
for k = 1:numel(classes)
    p = sum(labels == classes(k)) / total;
    if p > 0
        h = h - p * log2(p);
    end
end
end
% Driver script: reset the session, load the data set and compute the
% information gain of every attribute column.

% Close open figures, wipe the workspace and clear the command window.
close all; clear all; clc;

% Read the numeric matrix from the CSV file.
data = csvread('corrected9.csv');

% Compute the per-column information gain table.
InforGain = hanshu(data);