SELECT
t1.feature_name,
SUM((ea_all/es)*EA) as gain,
SUM(NVL(-(ea_all/ES)*log2(ea_all/es),0)) as info,--计算信息增益率的分母
SUM((ea_all/es)*EA)/SUM(NVL(-(ea_all/es)*log2(ea_all/es),0)) as gain_rate--信息增益率计算
FROM
(
SELECT
feature_name,
feature_value,
ea_all,
--Key Step2 对于整体熵,要记得更换符号,NVL的出现是防止计算log2(0)得NULL
case
when feature_value='-100' then -(NVL((ea_positive/ea_all)*log2(ea_positive/ea_all),0)+NVL((ea_negative/ea_all)*log2(ea_negative/ea_all),0))
else (NVL((ea_positive/ea_all)*log2(ea_positive/ea_all),0)+NVL((ea_negative/ea_all)*log2(ea_negative/ea_all),0))
end as EA
FROM
(
SELECT
feature_name,
feature_value,
SUM(case when is_lost=-100 then user_cnt else 0 end) as ea_all,
SUM(case when is_lost=1 then user_cnt else 0 end) as ea_positive,
SUM(case when is_lost=0 then user_cnt else 0 end) as ea_negative
FROM
(
SELECT
feature_name,
--Key Step1 对feature值和label值做汇总统计,1、用于熵计算的分母,2、计算整体熵情况
case when grouping(feature_value)=1 then '-100' else feature_value end as feature_value,
case when grouping(is_lost)=1 then -100 else is_lost end as is_lost,
COUNT(1) as user_cnt
FROM
(
SELECT feature_name,feature_value,is_lost FROM gain_caculate
)GROUP BY feature_name,cube(feature_value,is_lost)
)GROUP BY feature_name,feature_value
)
)t1 join
(
--Key Step3信息增益计算时,需要给出样本总量作为分母
SELECT feature_name,COUNT(1) as es FROM gain_caculate
GROUP BY feature_name
)t2 on t1.feature_name=t2.feature_name
GROUP BY t1.feature_name