jiang1799 发表于 2018-8-12 08:02:20

朴素贝叶斯算法的Python实现

# Author :Wenxiang Cui  
# Date :2015/9/11
  
# Function: A classifier that uses the naive Bayes algorithm
  

  
import math
  

  
class Bayesian:
    """Naive Bayes classifier over a table of string-valued samples.

    Typical use: loadDataS() -> getAttriList() -> getDesClass() ->
    calPriorProb() -> calProb() once per attribute column.
    """

    def __init__(self):
        self.dataS = []       # training samples: one list of field strings per row
        self.attriList = []   # attribute names supplied through getAttriList()
        self.desClass = 0     # column index of the class label inside each row
        self.PriorP = {}      # class label -> prior probability
        self.classLabel = []  # class labels, in order of first appearance
        self.dictFreq = {}    # class label -> number of training samples

    def loadDataS(self, fileName, decollator):
        """Read the training data from disk into self.dataS.

        input:
            fileName   - path of the data file
            decollator - field separator used on every line (e.g. ' ' or ',')
        function:
            Each non-blank line becomes one row (a list of field strings).
        """
        rows = []
        # The context manager closes the file even if reading fails.
        with open(fileName, 'r') as fp:
            for line in fp:
                line = line.rstrip('\r\n')
                # Blank lines would become [''] rows and break column lookups.
                if line.strip():
                    rows.append(line.split(decollator))
        self.dataS = rows

    def getAttriList(self, attributes):
        """Store a copy of the attribute names.

        input:
            attributes - attribute names; must match the data columns
        """
        self.attriList = attributes[:]

    def getDesClass(self, loca):
        """Record which column holds the class label.

        input:
            loca - column index of the class label
        """
        self.desClass = loca

    def calPriorProb(self):
        """Compute the prior probability of every class.

        function:
            Fills self.PriorP (label -> probability), self.classLabel (labels
            in order of first appearance) and self.dictFreq (label -> count).
        """
        classFreq = {}
        labels = []
        for row in self.dataS:
            label = row[self.desClass]
            if label not in classFreq:
                classFreq[label] = 0
                labels.append(label)
            classFreq[label] += 1

        sampleNum = len(self.dataS)
        # `labels` is empty when there is no data, so no division by zero.
        self.PriorP = dict(
            (label, float(classFreq[label]) / sampleNum) for label in labels
        )
        self.classLabel = labels
        self.dictFreq = classFreq

    def calProb(self, type, loca):
        """Compute the class-conditional distribution of one attribute.

        input:
            type - 'continuous' or 'discrete'
            loca - column index of the attribute
        output:
            'continuous': a list parallel to self.classLabel whose entries are
                [mean, standard deviation] of the attribute within that class.
            'discrete': a dict {class label: {attribute value: probability}}.
            any other type: prints 'Wrong type!' and returns None.
        """
        # Class labels come from calPriorProb(); compute them on demand so the
        # method also works when the caller has not done so yet.
        if not self.classLabel:
            self.calPriorProb()

        if type == 'continuous':
            grouped = dict((label, []) for label in self.classLabel)
            for row in self.dataS:
                grouped[row[self.desClass]].append(float(row[loca]))
            return [list(self.calParam(grouped[label]))
                    for label in self.classLabel]

        if type == 'discrete':
            counts = dict((label, {}) for label in self.classLabel)
            values = []      # distinct attribute values, first-seen order
            seen = set()
            for row in self.dataS:
                value = row[loca]
                if value not in seen:
                    seen.add(value)
                    values.append(value)
                perClass = counts[row[self.desClass]]
                perClass[value] = perClass.get(value, 0) + 1

            dictProb = {}
            for label in self.classLabel:
                perClass = counts[label]
                if len(perClass) < len(values):
                    # Some value never occurs with this class: add the missing
                    # entries as 0 and apply Laplace smoothing so that no
                    # conditional probability is exactly zero.
                    filled = dict((v, perClass.get(v, 0)) for v in values)
                    perClass = self.LaplaceEstimator(filled)
                # Normalising by the (possibly smoothed) total keeps the
                # probabilities of each class summing to 1.
                total = float(sum(perClass.values()))
                dictProb[label] = dict(
                    (v, c / total) for v, c in perClass.items()
                )
            return dictProb

        print('Wrong type!')
        return None

    def calParam(self, souList):
        """Return the mean and sample standard deviation of a list.

        input:
            souList - non-empty list of numbers
        output:
            (meanVal, deviation); deviation is 0.0 for a single element,
            because the n-1 denominator is undefined there.
        raises:
            ValueError if souList is empty.
        """
        count = len(souList)
        if count == 0:
            raise ValueError('calParam() needs at least one value')
        meanVal = sum(souList) / float(count)
        if count == 1:
            return meanVal, 0.0
        squared = sum((val - meanVal) ** 2 for val in souList)
        deviation = math.sqrt(float(squared) / (count - 1))
        return meanVal, deviation

    def LaplaceEstimator(self, souDict):
        """Return a copy of a frequency dict with every count increased by 1.

        input:
            souDict - dict {value: count}
        output:
            desDict - smoothed dict; souDict itself is left untouched
        """
        return dict((key, count + 1) for key, count in souDict.items())
  

  
class CarBayesian(Bayesian):
    """Naive Bayes classifier specialised for six discrete car attributes.

    The attribute columns are expected at indices 0-5 of every training row,
    in the order buying, maint, doors, persons, lug_boot, safety.
    """

    def __init__(self):
        Bayesian.__init__(self)
        # One conditional-probability table per attribute, each shaped
        # {class label: {attribute value: probability}} once trained.
        self.buying = {}
        self.maint = {}
        self.doors = {}
        self.persons = {}
        self.lug_boot = {}
        self.safety = {}
        self.Prob = []  # the six tables above, in column order

    def tranning(self):
        """Train the classifier on self.dataS.

        function:
            Computes the class priors, then the class-conditional table of
            each of the six attributes, and collects the tables in self.Prob.
        """
        # calProb() and classify() both need the class labels and priors.
        self.calPriorProb()

        # Call through self: calProb is an instance method and reads the data.
        self.buying = self.calProb('discrete', 0)
        self.maint = self.calProb('discrete', 1)
        self.doors = self.calProb('discrete', 2)
        self.persons = self.calProb('discrete', 3)
        self.lug_boot = self.calProb('discrete', 4)
        self.safety = self.calProb('discrete', 5)

        self.Prob = [
            self.buying,
            self.maint,
            self.doors,
            self.persons,
            self.lug_boot,
            self.safety,
        ]

    def classify(self, sample):
        """Print and return the most probable class of a sample.

        input:
            sample - attribute values in column order; any trailing entries
                     beyond the trained attributes (e.g. a class label copied
                     from a data row) are ignored
        output:
            the class label with the highest posterior score
        raises:
            ValueError if the classifier has not been trained,
            KeyError if an attribute value never occurred in training.
        """
        if not self.classLabel or not self.Prob:
            raise ValueError('classifier is not trained; call tranning() first')

        bestLabel = None
        bestScore = -1.0
        for label in self.classLabel:
            # Posterior score = prior * product of conditional probabilities.
            score = self.PriorP[label]
            # zip() stops at the shorter sequence, so every trained attribute
            # present in the sample is used and nothing past them is read.
            for table, value in zip(self.Prob, sample):
                score *= table[label][value]
            # Strict '>' keeps the first label on ties.
            if score > bestScore:
                bestScore = score
                bestLabel = label

        print("该样本属于的类别是: %s" % bestLabel)
        return bestLabel
  

  

  
# Demo driver: train on the car data file and classify one sample.

# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences. The resulting value is identical to the original literal.
filename = r"D:\MyDocuments-HnH\DataMining\DataSets\Car\Car_Data.txt"

MyCar = CarBayesian()
MyCar.loadDataS(filename, ',')

attributes = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
MyCar.getAttriList(attributes)

# The class label is the 7th field of every row, i.e. index 6.
MyCar.getDesClass(7 - 1)

# Priors (and the class label list) must exist before the conditional
# probability tables are built.
MyCar.calPriorProb()
MyCar.tranning()

sample = ['vhigh', 'vhigh', '2', '2', 'small', 'low']
MyCar.classify(sample)
页: [1]
查看完整版本: 朴素贝叶斯算法的Python实现