python实现的k-means算法(原创)

gsbyqjkwdg · 发表于 2015-4-20 10:22:26

1 #! /usr/bin/env python
  2 # -*- coding: utf-8 -*-
  3 import os
  4 import sys
  5 import cmath
  6 import os.path
  7
  8 class KMeans:
  9    '''
10    @descriptions: K-means Algorithm implementation.
11    @filename:    Filename of input data.
12    @knums:       Clusters number.
13 '''
14    def __init__(self, filename, knums):
15       self._filename = filename;
16       self._knums = knums
17       self._dimension = 0
18       """self._samples := [(seqx, x1, x2, ..., xn),
19                            (seqy, y1, y2, ..., yn),
20                            ...,
21                            (seqz, z1, z2, ..., zn)]"""
22       self._samples= []
23       """self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)],
24                            [],
25                            ...,
26                            []]"""
27       self._clusters = []
28
29       self._open(self._filename)
30       self._normalize()
31       #print self._samples
32       self._select(self._knums)
33
34
35    def _normalize(self):
36       """
37       @description: Normalize the attributes of input data.
38 """
39       new_samples = []
40       for t in xrange(len(self._samples)):
41          st = list(self._samples[t])
42          new_samples.append(st)
43
44       for t in xrange(len(self._samples)):
45          self._samples.pop()
46
47       for d in xrange(1, (self._dimension + 1)):
48          container_att = []
49          for idx in xrange(len(new_samples)):
50                att = new_samples[idx][d]
51                container_att.append(att)
52
53          max_att = max(container_att)
54          min_att = min(container_att)
55
56          for idx in xrange(len(new_samples)):
57                new_att = (new_samples[idx][d] - min_att) / (max_att - min_att)
58                new_samples[idx][d] = new_att
59
60       for t in xrange(len(new_samples)):
61          st = tuple(new_samples[t])
62          self._samples.append(st)
63
64
65
66    def _open(self, filename):
67       """
68       @descriptions: Open the data file and fill each item into memory.
69       @filename : Filename of input data.
70 """
71       data_file= open(self._filename, "r")
72       data_lines= data_file.readlines();
73       for line in data_lines:
74          string_samples = line.split("")
75          integer_samples= []
76
77          integer_samples.append(int(string_samples[0]))
78
79          for e in string_samples[1:]:
80                integer_samples.append(float(e))
81          samples = tuple(integer_samples)
82          self._samples.append(samples)
83       #print self._samples
84       self._dimension = len(self._samples[0]) - 1
85       #print self._dimension
86
87
88    def _select(self, knums):
89       """
90       @descriptions: Choose the first knums cluster center.
91       @knums    : Clusters number.
92 """
93       for i in xrange(knums):
94          selected = self._samples
95          temp = list(selected)
96          temp[0] = 0
97          self._clusters.append([])
98          self._clusters.append(temp)
99       #print self._clusters
100
101
102    def _distance(self, va, vb):
103       '''
104       @description: Return the (distance ** 2) of tuple va and tuple vb.
105       @va       : tuple va (x1, x2, ..., xn)
106       @vb       : tuple vb (y1, y2, ..., yn)
107 '''
108       distance = 0
109       for i in xrange(self._dimension):
110          distance += (va - vb) * (va - vb)
111       #print distance
112
113       return distance
114
115
116    def _means(self, va):
117       """
118       @description: Return the means of va.
119       @va       : A tuple of list va, with the form [(flagx, x1, x2, ..., xn),
120                                                       (flagy, y1, y2, ..., yn),
121                                                       (flagz, z1, z2, ..., zn), ...]
122 """
123       if (len(va) == 0):
124          return va
125
126       means_cluster = []
127       means_cluster.append(1)#Indicate that the means has changed.
128
129       #print va
130       for d in xrange(self._dimension):
131          tmp = 0
132          for i in xrange(len(va)):
133                tmp += va[d+1]
134          means_cluster.append(tmp/len(va))
135       means = tuple(means_cluster)
136
137       return means
138
139    def _equal(self, ta, tb):
140       """
141       @description: Check if tuple ta equals to tuple tb.
142       @ta       : Tuple ta.(flagx, x1, x2, ..., xn)
143       @tb       : Tuple tb.(flagy, y1, y1, ..., ym)
144 """
145       if (len(ta) != len(tb)):
146          return False
147
148       for i in xrange(1, len(ta)):
149          if (ta != tb):
150                return False
151
152       return True
153
154    def flush(self, filename):
155       """
156       @description: Flush data the disk.
157       @filename : Filename of output data.
158 """
159       foutput = open(filename, "w")
160
161       for c in xrange(self._knums):
162          foutput.write("Group %d" % c)
163          for e in self._clusters[c][1:]:
164                foutput.write("%s" %  repr(e))
165          foutput.write("\n\n\n")
166       print("Done.")
167       foutput.close()
168
169    def _reconstruct(self, idx):
170       """
171       @description: Reconstruct the cluster points.
172
173       @idx       : Index of clusters, where clusters has the form as follows:
174       self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)],
175                         [],
176                         ...,
177                         []]
178 """
179       new_cluster = []
180       new_cluster.append(0)
181       for old_value in self._clusters[idx][0][1:]:
182          new_cluster.append(old_value)
183       for i in xrange(len(self._clusters[idx])):
184          self._clusters[idx].pop()
185       self._clusters[idx].insert(0, new_cluster)
186
187
188    def process(self):
189       """
190       @description: Process data, calculating k-means and clustering.
191 """
192       while True:
193          K = 0
194          for e in self._samples:
195                #print e
196                shortest = -1
197                for k in xrange(self._knums):
198                   #for k in _clusters[]
199                   #print e
200                   #print self._clusters[k][0]
201                   distance = self._distance(e[1:], self._clusters[k][0][1:])
202                   #print distance
203                   if (distance < 0.000001):
204                      # add e to the k-th cluster.
205                      self._clusters[k].append(e)
206                      break
207                   else:
208                      if (shortest == -1):
209                            shortest = distance
210                      else:
211                            if (shortest > distance):
212                               shortest = distance
213                               K = k
214                   if (k != self._knums - 1):
215                      continue
216                   else:
217                      # add e to the k-th cluster
218                      self._clusters[K].append(e)
219          #print self._clusters
220
221          for k in xrange(self._knums):
222                new_ktuple = self._means(self._clusters[k][1:])
223                if (len(new_ktuple) == 0):
224                   continue
225                if (self._equal(self._clusters[k][0], new_ktuple) == False):
226                   self._clusters[k].pop(0)
227                   self._clusters[k].insert(0, new_ktuple)
228
229                else:
230                   continue
231
232          flag = 0
233          for idx in xrange(self._knums):
234                if (self._clusters[idx][0][0] == 1):
235                   flag = 1
236                   break
237                else:
238                   continue
239
240          if (flag == 1):
241                for idx in xrange(self._knums):
242                   self._reconstruct(idx)
243          else:
244                break
245
246
if __name__ == "__main__":
    # Script entry point: load the iris sample file, split it into three
    # clusters, then write the resulting groups to the output file.
    ikmeans = KMeans(filename="./iris-1.dat", knums=3)
    ikmeans.process()
    ikmeans.flush(filename="./k-means-out.dat")
　　以上是 K-means 算法的 Python 实现，编写加调试花了差不多一天的时间，希望对大家有用。关于 K-means 聚类算法和 ISODATA 算法的解释，见下一篇博文。

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] python实现的k-means算法(原创)

浏览过的版块

扫码加入运维网微信交流群