# Step 4: perform the (logistic) regression.
# NOTE(review): this chunk relies on `data` (a pandas DataFrame with an
# 'admit' column plus dummy-coded predictors), `sm` (statsmodels.api) and
# `np` (numpy) being defined earlier in the file — confirm against the
# imports above this chunk.
train_cols = data.columns[1:]
# Index([gre, gpa, prestige_2, prestige_3, prestige_4], dtype=object)
logit = sm.Logit(data['admit'], data[train_cols])
# fit the model
result = logit.fit()
# Step 5: interpret the result.
# cool enough to deserve its own gist
print (result.summary())
# look at the confidence interval of each coefficient
print (result.conf_int())
#                   0         1
# gre        0.000120  0.004409
# gpa        0.153684  1.454391
# prestige_2 -1.295751 -0.055135
# prestige_3 -2.016992 -0.663416
# prestige_4 -2.370399 -0.732529
# intercept  -6.224242 -1.755716
# Odds ratios:
# take the exponential of each coefficient to generate the odds ratios.
# This tells you how a 1 unit increase or decrease in a variable affects
# the odds of being admitted.
# odds ratios only
print (np.exp(result.params))
# gre           1.002267
# gpa           2.234545
# prestige_2    0.508931
# prestige_3    0.261792
# prestige_4    0.211938
# intercept     0.018500
# Prediction: score the same rows the model was fit on and store the
# predicted admission probability as a new column.
data['admit_pred']=result.predict(data[train_cols])
print (data.head())
# More details: http://blog.yhathq.com/posts/logistic-regression-and-python.html
def max_Entropy():
    """Build gender-classification train/test sets from the NLTK names corpus.

    Demonstrates three candidate feature extractors intended for NLTK's
    maximum-entropy (or other) classifier trainers, then builds a shuffled
    labelled data set from the ``names`` corpus and splits it.

    Returns:
        tuple: ``(train_set, test_set)`` — lists of ``(feature_dict, label)``
        pairs suitable for an NLTK classifier trainer.  The test set is the
        first 500 shuffled examples; the train set is the remainder.

    NOTE(review): the original version computed these sets and discarded
    them (returned ``None``); returning them is backward compatible because
    previous callers ignored the return value.  The shuffle is unseeded, so
    the split differs between calls.
    """
    # Imports kept local (as in the original) so the module loads even
    # when nltk is not installed; the names corpus must be downloaded.
    import nltk
    from nltk.corpus import names
    import random

    # 1. Feature extraction: an extractor maps a name to a feature dict
    #    (the "feature set" NLTK classifiers consume).
    # 1.1 Extractor 1: last letter only — the one actually used below.
    def gender_feature(word):
        return {'last_letter': word[-1]}

    # 1.2 Extractor 2: one- and two-letter suffixes (unused alternative).
    def gender_feature2(word):
        return {'suffix1': word[-1:], 'suffix2': word[-2:]}

    # 1.3 Extractor 3: a deliberately over-fitting extractor (unused) —
    #     per-letter counts and presence flags for the whole alphabet.
    def gender_feature3(name):
        feature = {}
        feature['firstletter'] = name[0].lower()
        feature['lastletter'] = name[-1].lower()
        for letter in 'abcdefghijklmnopqrstuvwxyz':
            feature['count(%s)' % letter] = name.lower().count(letter)
            feature['has(%s)' % letter] = (letter in name.lower())
        return feature

    # 2. Build and shuffle the labelled examples (preprocessing).
    _names = [(name, 'male') for name in names.words('male.txt')] + \
             [(name, 'female') for name in names.words('female.txt')]
    random.shuffle(_names)
    # Classifier input format: [(feature_dict, label), ...]
    featureset = [(gender_feature(name), g) for (name, g) in _names]
    train_set, test_set = featureset[500:], featureset[:500]
    # A common variant splits three ways — dev-train, dev-test, test —
    # using the dev-test set for error analysis.
    return train_set, test_set
# NOTE(review): this is a verbatim duplicate of the `max_Entropy`
# definition earlier in this file; it rebinds the same name and shadows
# the first copy.  One of the two definitions should be removed.
# NOTE(review): as presented, the body lines carry no indentation relative
# to the `def` line — if the real file looks like this it will not parse;
# confirm against the original source before relying on this chunk.
def max_Entropy():
# Uses NLTK's maximum-entropy trainer machinery directly.
import nltk
# 1. Feature extraction: features are passed as a dict (the "feature set").
# 1.1 Feature extractor 1: last letter of the name.
def gender_feature(word):
return {'last_letter':word[-1]}
# 1.2 Feature extractor 2: one- and two-letter suffixes.
def gender_feature2(word):
return {'suffix1': word[-1:],'suffix2': word[-2:]}
# 1.3 Feature extractor 3: a deliberately over-fitting extractor.
def gender_feature3(name):
feature={}
feature['firstletter']=name[0].lower()
feature['lastletter']=name[-1].lower()
for letter in 'abcdefghijklmnopqrstuvwxyz':
# for letter in name:
feature['count(%s)'%letter]=name.lower().count(letter)
feature['has(%s)'%letter]=(letter in name.lower())
return feature
# 2. Build the train and test sets (including preprocessing).
from nltk.corpus import names
import random
_names=[(name,'male') for name in names.words('male.txt')]+\
[(name,'female') for name in names.words('female.txt')]
random.shuffle(_names)
# Classifier input format: [(feature_dict, label), ...]
featureset=[(gender_feature(name),g) for (name,g) in _names]
train_set,test_set=featureset[500:],featureset[:500]
# A common variant splits three ways: dev-train, dev-test, test;
# the dev-test set is used for checking and error analysis.