Skip to content

Commit 41c66e7

Browse files
authored
Merge pull request #82 from dmorgankx/cluster_streaming
Addition of clustering fit, predict and update methods
2 parents 0628728 + 726c30c commit 41c66e7

File tree

12 files changed

+1699
-697
lines changed

12 files changed

+1699
-697
lines changed

clust/aprop.q

Lines changed: 188 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,197 @@
11
\d .ml
22

3-
// Affinity propagation algorithm
4-
/* data = data points in `value flip` format
5-
/* df = distance function
6-
/* dmp = damping coefficient
7-
/* diag = similarity matrix diagonal value function
8-
/. r > return list of clusters
9-
clust.ap:{[data;df;dmp;diag]
10-
// check distance function and diagonal value
11-
if[not df in key clust.i.dd;clust.i.err.dd[]];
12-
// create initial table with exemplars/matches and similarity, availability and responsibility matrices
13-
info0:clust.i.apinit["f"$data;df;diag];
14-
// run AP algo until there is no change in results over `0.1*count data` runs
15-
info1:{[maxiter;info]maxiter>info`matches}[.1*count data]clust.i.apalgo[dmp]/info0;
16-
// return list of clusters
17-
clust.i.reindex info1`exemplars}
3+
// Affinity Propagation
184

19-
// Initialize matrices
20-
/* data = data points in `value flip` format
21-
/* df = distance function
22-
/* diag = similarity matrix diagonal value
23-
/. r > returns a dictionary with similarity, availability and responsibility matrices
24-
/ and keys for matches and exemplars to be filled during further iterations
25-
clust.i.apinit:{[data;df;diag]
26-
// calculate similarity matrix values
27-
s:@[;;:;diag raze s]'[s:clust.i.dists[data;df;data]each k;k:til n:count data 0];
28-
// create lists/matrices of zeros for other variables
29-
`matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f}
5+
// @kind function
6+
// @category clust
7+
// @fileoverview Fit affinity propagation algorithm
8+
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
9+
// @param df {symbol} Distance function name within '.ml.clust.df'
10+
// @param dmp {float} Damping coefficient
11+
// @param diag {func} Function applied to the razed similarity matrix; its result is written to the matrix diagonal
12+
// @param iter {dict} Max number of overall iterations and iterations
13+
// without a change in clusters. (::) can be passed in which case the defaults
14+
// of (`total`nochange!200 15) will be used
15+
// @return {dict} Data, input variables, clusters and exemplars
16+
// (`data`inputs`clt`exemplars) required for the predict method
17+
clust.ap.fit:{[data;df;dmp;diag;iter]
18+
data:clust.i.floatConversion[data];
19+
defaultDict:`run`total`nochange!0 200 15;
20+
if[iter~(::);iter:()!()];
21+
if[99h<>type iter;'"iter must be (::) or a dictionary"];
22+
// update iteration dictionary with user changes
23+
updDict:defaultDict,iter;
24+
// cluster data using AP algo
25+
clust.i.runap[data;df;dmp;diag;til count data 0;updDict]
26+
}
3027

31-
// Run affinity propagation algorithm
32-
/* dmp = damping coefficient
33-
/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices
34-
/. r > returns updated info
28+
// @kind function
29+
// @category clust
30+
// @fileoverview Predict clusters using AP config
31+
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
32+
// @param cfg {dict} `data`inputs`clt`exemplars returned by clust.ap.fit
33+
// @return {long[]} List of predicted clusters
34+
clust.ap.predict:{[data;cfg]
35+
data:clust.i.floatConversion[data];
36+
if[-1~first cfg`clt;
37+
'"'.ml.clust.ap.fit' did not converge, all clusters returned -1. Cannot predict new data."];
38+
// retrieve cluster centres from training data
39+
ex:cfg[`data][;distinct cfg`exemplars];
40+
// predict testing data clusters
41+
clust.i.appreddist[ex;cfg[`inputs]`df]each$[0h=type data;flip;enlist]data
42+
}
43+
44+
45+
// Utilities
46+
47+
// @kind function
48+
// @category private
49+
// @fileoverview Run affinity propagation algorithm
50+
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
51+
// @param df {symbol} Distance function name within '.ml.clust.df'
52+
// @param dmp {float} Damping coefficient
53+
// @param diag {func} Function applied to the razed similarity matrix; its result is written to the matrix diagonal
54+
// @param idxs {long[]} List of indices to find distances for
55+
// @param iter {dict} Max number of overall iterations and iterations
56+
// without a change in clusters, plus the current run counter
57+
// (`run`total`nochange), as assembled by clust.ap.fit
58+
// @return {dict} Data, input variables, clusters and exemplars
59+
clust.i.runap:{[data;df;dmp;diag;idxs;iter]
60+
// check negative euclidean distance has been given
61+
if[df<>`nege2dist;clust.i.err.ap[]];
62+
// calculate distances, availability and responsibility
63+
info0:clust.i.apinit[data;df;diag;idxs];
64+
// initialize exemplar matrix and convergence boolean
65+
info0,:`emat`conv`iter!((count data 0;iter`nochange)#0b;0b;iter);
66+
// run ap algo until maximum number of iterations completed or convergence
67+
info1:clust.i.apstop clust.i.apalgo[dmp]/info0;
68+
// return data, inputs, clusters and exemplars
69+
inputs:`df`dmp`diag`iter!(df;dmp;diag;iter);
70+
exemplars:info1`exemplars;
71+
clt:$[info1`conv;clust.i.reindex exemplars;count[data 0]#-1];
72+
`data`inputs`clt`exemplars!(data;inputs;clt;exemplars)
73+
}
74+
75+
// @kind function
76+
// @category private
77+
// @fileoverview Initialize matrices
78+
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
79+
// @param df {symbol} Distance function name within '.ml.clust.df'
80+
// @param diag {func} Function applied to the razed similarity matrix; its result is written to the matrix diagonal
81+
// @param idxs {long[]} List of point indices
82+
// @return {dict} Similarity, availability and responsibility matrices
83+
// and keys for matches and exemplars to be filled during further iterations
84+
clust.i.apinit:{[data;df;diag;idxs]
85+
// calculate similarity matrix values
86+
s:clust.i.dists[data;df;data]each idxs;
87+
// update diagonal
88+
s:@[;;:;diag raze s]'[s;k:til n:count data 0];
89+
// create lists/matrices of zeros for other variables
90+
`matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f
91+
}
92+
93+
// @kind function
94+
// @category private
95+
// @fileoverview Run a single iteration of the affinity propagation algorithm
96+
// @param dmp {float} Damping coefficient
97+
// @param info {dict} Similarity, availability, responsibility, exemplars,
98+
// matches, exemplar matrix, convergence boolean and iter dict
99+
// @return {dict} Updated info dictionary
35100
clust.i.apalgo:{[dmp;info]
36-
// update responsibility matrix
37-
info[`r]:clust.i.updr[dmp;info];
38-
// update availability matrix
39-
info[`a]:clust.i.upda[dmp;info];
40-
// find new exemplars
41-
ex:imax each sum info`a`r;
42-
// return updated `info` with new exemplars/matches
43-
update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info}
101+
// update responsibility matrix
102+
info[`r]:clust.i.updr[dmp;info];
103+
// update availability matrix
104+
info[`a]:clust.i.upda[dmp;info];
105+
// find new exemplars
106+
ex:imax each sum info`a`r;
107+
// update `info` with new exemplars/matches
108+
info:update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info;
109+
// update iter dictionary
110+
.[clust.i.apconv info;(`iter;`run);+[1]]
111+
}
44112

45-
// Update responsibility matrix
46-
/* dmp = damping coefficient
47-
/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices
48-
/. r > returns updated responsibility matrix
113+
// @kind function
114+
// @category private
115+
// @fileoverview Check affinity propagation algorithm for convergence
116+
// @param info {dict} Similarity, availability, responsibility, exemplars,
117+
// matches, exemplar matrix, convergence boolean and iter dict
118+
// @return {dict} Updated info dictionary
119+
clust.i.apconv:{[info]
120+
// iteration dictionary
121+
iter:info`iter;
122+
// exemplar matrix
123+
emat:info`emat;
124+
// existing exemplars
125+
ediag:0<sum clust.i.diag each info`a`r;
126+
emat[;iter[`run]mod iter`nochange]:ediag;
127+
// check for convergence
128+
if[iter[`nochange]<=iter`run;
129+
unconv:count[info`s]<>sum(se=iter`nochange)+0=se:sum each emat;
130+
conv:$[(iter[`total]=iter`run)|not[unconv]&sum[ediag]>0;1b;0b]];
131+
// return updated info
132+
info,`emat`conv!(emat;conv)
133+
}
134+
135+
// @kind function
136+
// @category private
137+
// @fileoverview Retrieve diagonal from a square matrix
138+
// @param m {any[][]} Square matrix
139+
// @return {any[]} Matrix diagonal
140+
clust.i.diag:{[m]
141+
{x y}'[m;til count m]
142+
}
143+
144+
// @kind function
145+
// @category private
146+
// @fileoverview Update responsibility matrix
147+
// @param dmp {float} Damping coefficient
148+
// @param info {dict} Similarity, availability, responsibility, exemplars,
149+
// matches, exemplar matrix, convergence boolean and iter dict
150+
// @return {float[][]} Updated responsibility matrix
49151
clust.i.updr:{[dmp;info]
50-
// create matrix with every points max responsibility - diagonal becomes -inf, current max is becomes second max
51-
mx:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]}'[sum info`s`a;til count info`r];
52-
// calculate new responsibility
53-
(dmp*info`r)+(1-dmp)*info[`s]-mx}
152+
// create matrix with every point's max responsibility
153+
// diagonal becomes -inf, current max becomes second max
154+
mxresp:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]};
155+
mx:mxresp'[sum info`s`a;til count info`r];
156+
// calculate new responsibility
157+
(dmp*info`r)+(1-dmp)*info[`s]-mx
158+
}
54159

55-
// Update availability matrix
56-
/* dmp = damping coefficient
57-
/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices
58-
/. r > returns updated availability matrix
160+
// @kind function
161+
// @category private
162+
// @fileoverview Update availability matrix
163+
// @param dmp {float} Damping coefficient
164+
// @param info {dict} Similarity, availability, responsibility, exemplars,
165+
// matches, exemplar matrix, convergence boolean and iter dict
166+
// @return {float[][]} Returns updated availability matrix
59167
clust.i.upda:{[dmp;info]
60-
// sum values in positive availability matrix
61-
s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a];
62-
// create a matrix using the negative values produced by the availability sum + responsibility diagonal - positive availability values
63-
a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s];
64-
// calculate new availability
65-
(dmp*info`a)+a*1-dmp}
168+
// sum the positive responsibility values, with the diagonal zeroed out
169+
s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a];
170+
// create a matrix using the negative values produced by the positive responsibility sum
171+
// + responsibility diagonal - positive responsibility values
172+
a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s];
173+
// calculate new availability
174+
(dmp*info`a)+a*1-dmp
175+
}
176+
177+
// @kind function
178+
// @category private
179+
// @fileoverview Stopping condition for affinity propagation algorithm
180+
// @param info {dict} Similarity, availability, responsibility, exemplars,
181+
// matches, exemplar matrix, convergence boolean and iter dict
182+
// @return {bool} Indicates whether to continue or stop running AP (1/0b)
183+
clust.i.apstop:{[info]
184+
(info[`iter;`total]>info[`iter]`run)&not 1b~info`conv
185+
}
186+
187+
// @kind function
188+
// @category private
189+
// @fileoverview Predict clusters using AP training exemplars
190+
// @param ex {float[][]} Training cluster centres in matrix format,
191+
// each column is an individual datapoint
192+
// @param df {symbol} Distance function name within '.ml.clust.df'
193+
// @param pt {float[]} Current data point
194+
// @return {long[]} Predicted clusters
195+
clust.i.appreddist:{[ex;df;pt]
196+
d?max d:clust.i.dists[ex;df;pt]each til count ex 0
197+
}

0 commit comments

Comments
 (0)