|
1 from numpy import array, append, vstack, transpose, reshape, \
2 dot, true_divide, mean, exp, sqrt, log, \
3 loadtxt, savetxt, zeros, frombuffer
4 from numpy.linalg import norm, lstsq
5 from multiprocessing import Process, Array
6 from random import sample
7 from time import time
8 from sys import stdout
9 from ctypes import c_double
10 from h5py import File
11
12
def metrics(a, b):
    """Return the distance between *a* and *b* as the norm of their difference."""
    difference = a - b
    return norm(difference)
15
16
def gaussian(x, mu, sigma):
    """Gaussian radial basis: exp(-r**2 / (2 * sigma**2)), r = metrics(mu, x)."""
    distance = metrics(mu, x)
    exponent = -distance**2 / (2 * sigma**2)
    return exp(exponent)
19
20
def multiQuadric(x, mu, sigma):
    """Multiquadric radial basis: (r**2 + sigma**2) ** 0.5, r = metrics(mu, x)."""
    distance = metrics(mu, x)
    return (distance**2 + sigma**2) ** 0.5
23
24
def invMultiQuadric(x, mu, sigma):
    """Inverse multiquadric basis: (r**2 + sigma**2) ** -0.5, r = metrics(mu, x)."""
    distance = metrics(mu, x)
    return (distance**2 + sigma**2) ** -0.5
27
28
def plateSpine(x, mu):
    """Thin-plate-spline radial basis: r**2 * log(r), r = metrics(mu, x).

    Returns 0.0 when x coincides with mu (r == 0). That is the limit of
    r**2 * log(r) as r -> 0; evaluating the formula directly there gives
    0 * log(0), i.e. NaN plus a runtime warning.
    """
    r = metrics(mu, x)
    if r == 0:
        return 0.0
    return (r**2) * log(r)
32
33
class Rbf:
    """Radial-basis-function network fitted by linear least squares.

    Centres (``mu``) are the rows of a cluster file plus ``extra_neurons``
    rows sampled from the training inputs; every centre gets its own width
    (``sigmas``); the output weights (``w``) solve ``phi . w ~= y`` in the
    least-squares sense, where ``phi[i, j]`` is the multiquadric basis of
    sample ``i`` against centre ``j``.

    ``train`` caches phi, os (network outputs), w, mu and sigmas in HDF5
    files named ``<prefix>-<what>-<n>-<extra_neurons>.hdf5``.
    """

    # Tab-delimited text file holding the pre-computed cluster centres.
    # Kept as a class attribute so it can be overridden without editing code.
    clusters_file = 'clusters100.txt'

    def __init__(self, prefix='rbf', workers=4, extra_neurons=0, from_files=None):
        """Configure the network and optionally load a stored model.

        prefix        -- file-name prefix of the HDF5 caches written by train().
        workers       -- number of processes used to compute phi.
        extra_neurons -- number of training rows sampled as additional centres.
        from_files    -- optional mapping with keys 'w', 'mu' and 'sigma'
                         naming HDF5 files that hold the datasets 'w', 'mu'
                         and 'sigmas'; they are opened read-only.
        """
        self.prefix = prefix
        self.workers = workers
        self.extra_neurons = extra_neurons

        # Import partial model
        if from_files is not None:
            w_handle = self.w_handle = File(from_files['w'], 'r')
            mu_handle = self.mu_handle = File(from_files['mu'], 'r')
            sigma_handle = self.sigma_handle = File(from_files['sigma'], 'r')

            # These stay HDF5 datasets until predict() materialises them.
            self.w = w_handle['w']
            self.mu = mu_handle['mu']
            self.sigmas = sigma_handle['sigmas']

            self.neurons = self.sigmas.shape[0]

    def close(self):
        """Close every HDF5 file this instance has opened.

        Safe to call repeatedly. Attributes that are still HDF5 datasets
        (self.phi after train(); self.w / self.mu / self.sigmas of a model
        loaded from files before predict()) cannot be read afterwards.
        """
        for name in ('phi_handle', 'os_handle', 'w_handle',
                     'mu_handle', 'sigma_handle'):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()
                setattr(self, name, None)

    def _calculate_error(self, y):
        """Store the mean absolute error of self.os against y, and its ratio to mean(y)."""
        self.error = mean(abs(self.os - y))
        self.relative_error = true_divide(self.error, mean(y))

    def _generate_mu(self, x):
        """Return the centres: cluster-file rows stacked over sampled rows of x."""
        mu_clusters = loadtxt(self.clusters_file, delimiter='\t')

        # Sample without replacement among the rows of x.
        mu_indices = sample(range(x.shape[0]), self.extra_neurons)
        mu_new = x[mu_indices, :]
        return vstack((mu_clusters, mu_new))

    def _calculate_sigmas(self):
        """Return one width per centre: twice its mean distance to the centres.

        The mean runs over all ``neurons`` centres, so the centre's zero
        distance to itself is part of the average.
        """
        neurons = self.neurons
        mu = self.mu

        sigmas = zeros((neurons, ))
        for i in range(neurons):
            dists = [metrics(mu[i], mu[j]) for j in range(neurons)]
            sigmas[i] = mean(dists) * 2
        return sigmas

    def _calculate_phi(self, x):
        """Return the (rows of x) by (neurons) basis matrix, built in parallel.

        Rows are split into contiguous ranges, one per worker process; the
        last worker also takes the remainder. Workers write straight into a
        shared-memory buffer that the returned array is a view of.

        NOTE: the worker is a closure, so this relies on the 'fork' start
        method of multiprocessing; it cannot be pickled for 'spawn'.
        """
        worker_count = self.workers
        neurons = self.neurons
        mu = self.mu
        sigmas = self.sigmas
        # Size from the data actually passed in, so prediction sets of any
        # length work and a model loaded from files needs no self.n.
        n = x.shape[0]

        def heavy_lifting(c, start, stop, phi):
            span = stop - start
            for k, i in enumerate(range(start, stop)):
                for j in range(neurons):
                    # The other kernels of this module (gaussian,
                    # invMultiQuadric, plateSpine) can be swapped in here.
                    phi[i, j] = multiQuadric(x[i, :], mu[j], sigmas[j])
                if k % 1000 == 0:
                    percent = true_divide(k, span) * 100
                    print('{} : {:2.2f}%'.format(c, percent))
            print('{} : Done'.format(c))

        shared_array = Array(c_double, n * neurons)
        phi = frombuffer(shared_array.get_obj()).reshape((n, neurons))

        # Floor division keeps the row bounds integral on Python 3 as well.
        chunk = n // worker_count
        remainder = n % worker_count
        jobs = []
        for c in range(worker_count):
            stop = (c + 1) * chunk + (remainder if c == worker_count - 1 else 0)
            jobs.append((c * chunk, stop))

        workers = []
        for c, (start, stop) in enumerate(jobs):
            worker = Process(target=heavy_lifting, args=(c, start, stop, phi))
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()

        return phi

    def _do_algebra(self, y):
        """Solve phi . w ~= y by least squares; return (w, outputs on the training set)."""
        # self.phi may be an HDF5 dataset: read it once instead of letting
        # every numpy call convert it again.
        phi = self.phi[()]

        w = lstsq(phi, y)[0]
        os = dot(w, transpose(phi))
        return w, os

    def train(self, x, y):
        """Fit the network to inputs x (one sample per row) and targets y.

        Sets self.n, self.mu, self.neurons, self.sigmas, self.phi (left as an
        HDF5 dataset), self.w, self.os, self.error and self.relative_error,
        and writes the HDF5 caches described on the class.
        """
        self.n = x.shape[0]

        ## Initialize HDF5 caches
        prefix = self.prefix
        postfix = str(self.n) + '-' + str(self.extra_neurons) + '.hdf5'
        name_template = prefix + '-{}-' + postfix
        phi_handle = self.phi_handle = File(name_template.format('phi'), 'w')
        os_handle = self.os_handle = File(name_template.format('os'), 'w')
        w_handle = self.w_handle = File(name_template.format('w'), 'w')
        mu_handle = self.mu_handle = File(name_template.format('mu'), 'w')
        sigma_handle = self.sigma_handle = File(name_template.format('sigma'), 'w')

        ## Mu generation
        mu = self.mu = self._generate_mu(x)
        self.neurons = mu.shape[0]
        print('({} neurons)'.format(self.neurons))
        mu_handle.create_dataset('mu', data=mu)

        ## Sigma calculation
        print('Calculating Sigma...')
        sigmas = self.sigmas = self._calculate_sigmas()
        sigma_handle.create_dataset('sigmas', data=sigmas)
        print('Done')

        ## Phi calculation
        print('Calculating Phi...')
        phi = self.phi = self._calculate_phi(x)
        print('Done')
        print('Serializing...')
        phi_h5 = phi_handle.create_dataset('phi', data=phi)
        # Drop the in-memory copy; from here on phi lives in the HDF5 file.
        del phi
        self.phi = phi_h5
        print('Done')

        ## Algebra
        print('Doing final algebra...')
        w, os = self._do_algebra(y)
        self.w = w
        # _calculate_error reads self.os, so it has to be stored here.
        self.os = os
        w_handle.create_dataset('w', data=w)
        os_handle.create_dataset('os', data=os)

        ## Calculate error
        self._calculate_error(y)

        # Push everything to disk even if close() is never called.
        for handle in (phi_handle, os_handle, w_handle, mu_handle, sigma_handle):
            handle.flush()
        print('Done')

    def predict(self, test_data, output_file='iok3834.txt'):
        """Return the network outputs for test_data and write them to output_file.

        Works for a model trained in this process as well as for one loaded
        through from_files.
        """
        # [()] reads an HDF5 dataset into memory and leaves an ndarray as it is.
        self.mu = self.mu[()]
        self.sigmas = self.sigmas[()]
        w = self.w = self.w[()]

        print('Calculating phi for test data...')
        phi = self._calculate_phi(test_data)
        os = dot(w, transpose(phi))
        savetxt(output_file, os, delimiter='\n')
        return os

    @property
    def summary(self):
        """Multi-line text report of the set size, layer size and training errors."""
        return '\n'.join(
            ['-----------------',
             'Training set size: {}'.format(self.n),
             'Hidden layer size: {}'.format(self.neurons),
             '-----------------',
             'Absolute error : {:02.2f}'.format(self.error),
             'Relative error : {:02.2f}%'.format(self.relative_error * 100)])
205
206
def predict(test_data, from_files=None, output_file='iok3834.txt'):
    """Evaluate a stored RBF model on test_data and return its outputs.

    test_data   -- 2-D array, one sample per row.
    from_files  -- optional mapping with keys 'mu', 'sigma' and 'w' naming
                   the HDF5 files that hold the datasets 'mu', 'sigmas' and
                   'w'; defaults to the rbf-*-212243-2400.hdf5 files.
    output_file -- text file the outputs are also written to.
    """
    if from_files is None:
        from_files = {
            'mu': 'rbf-mu-212243-2400.hdf5',
            'sigma': 'rbf-sigma-212243-2400.hdf5',
            'w': 'rbf-w-212243-2400.hdf5',
        }

    # [()] copies the whole dataset into memory, so each file can be closed
    # straight away.
    with File(from_files['mu'], 'r') as handle:
        mu = handle['mu'][()]
    with File(from_files['sigma'], 'r') as handle:
        sigmas = handle['sigmas'][()]
    with File(from_files['w'], 'r') as handle:
        w = handle['w'][()]

    n = test_data.shape[0]
    neur = mu.shape[0]

    # mu is indexed one centre per row (mu[j]), so it must not be transposed
    # or reshaped before the loop below.
    phi = zeros((n, neur))
    for i in range(n):
        for j in range(neur):
            phi[i, j] = multiQuadric(test_data[i, :], mu[j], sigmas[j])

    os = dot(w, transpose(phi))
    savetxt(output_file, os, delimiter='\n')
    return os
|
|