|
1 |
| -\d .ml |
| 1 | +// clust/init.q - Affinity propagation |
| 2 | +// Copyright (c) 2021 Kx Systems Inc |
| 3 | +// |
| 4 | +// Clustering using affinity propagation. |
| 5 | +// Affinity Propagation groups data based on the similarity |
| 6 | +// between points and subsequently finds exemplars, which best |
| 7 | +// represent the points in each cluster. The algorithm does |
| 8 | +// not require the number of clusters be provided at run time, |
| 9 | +// but determines the optimum solution by exchanging real-valued |
| 10 | +// messages between points until a high-valued set of exemplars |
| 11 | +// is produced. |
2 | 12 |
|
3 |
| -// Affinity Propagation |
| 13 | +\d .ml |
4 | 14 |
|
5 | 15 | // @kind function
|
6 | 16 | // @category clust
|
7 |
| -// @fileoverview Fit affinity propagation algorithm |
8 |
| -// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
9 |
| -// @param df {symbol} Distance function name within '.ml.clust.df' |
10 |
| -// @param dmp {float} Damping coefficient |
11 |
| -// @param diag {func} Function applied to the similarity matrix diagonal |
12 |
| -// @param iter {dict} Max number of overall iterations and iterations |
13 |
| -// without a change in clusters. (::) can be passed in which case the defaults |
14 |
| -// of (`total`nochange!200 15) will be used |
15 |
| -// @return {dict} Data, input variables, clusters and exemplars |
16 |
| -// (`data`inputs`clt`exemplars) required for the predict method |
17 |
| -clust.ap.fit:{[data;df;dmp;diag;iter] |
| 17 | +// @desc Fit affinity propagation algorithm |
| 18 | +// @param data {float[][]} Each column of the data is an individual datapoint |
| 19 | +// @param df {symbol} Distance function name within '.ml.clust.df' |
| 20 | +// @param damp {float} Damping coefficient |
| 21 | +// @param diag {fn} Function applied to the similarity matrix diagonal |
| 22 | +// @param iter {dictionary} Max number of overall iterations and iterations |
| 23 | +// without a change in clusters. (::) can be passed in which case the |
| 24 | +// defaults of (`total`noChange!200 15) will be used |
| 25 | +// @return {dictionary} `modelInfo holding the data, input variables, |
| 26 | +// clusters and exemplars (`data`inputs`clust`exemplars) required for |
| 27 | +// prediction, and `predict holding a projection of the predict function |
| 28 | +clust.ap.fit:{[data;df;damp;diag;iter] |
18 | 29 | data:clust.i.floatConversion[data];
|
19 |
| - defaultDict:`run`total`nochange!0 200 15; |
| 30 | + defaultDict:`run`total`noChange!0 200 15; |
20 | 31 | if[iter~(::);iter:()!()];
|
21 | 32 | if[99h<>type iter;'"iter must be (::) or a dictionary"];
|
22 |
| - // update iteration dictionary with user changes |
| 33 | + // Update iteration dictionary with user changes |
23 | 34 | updDict:defaultDict,iter;
|
24 |
| - // cluster data using AP algo |
25 |
| - clust.i.runap[data;df;dmp;diag;til count data 0;updDict] |
| 35 | + // Cluster data using AP algo |
| 36 | + modelInfo:clust.i.runAp[data;df;damp;diag;til count data 0;updDict]; |
| 37 | + returnInfo:enlist[`modelInfo]!enlist modelInfo; |
| 38 | + predictFunc:clust.ap.predict returnInfo; |
| 39 | + returnInfo,enlist[`predict]!enlist predictFunc |
26 | 40 | }
|
27 | 41 |
|
28 | 42 | // @kind function
|
29 | 43 | // @category clust
|
30 |
| -// @fileoverview Predict clusters using AP config |
31 |
| -// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
32 |
| -// @param cfg {dict} `data`inputs`clt`exemplars returned by clust.ap.fit |
33 |
| -// @return {long[]} List of predicted clusters |
34 |
| -clust.ap.predict:{[data;cfg] |
| 44 | +// @desc Predict clusters using AP config |
| 45 | +// @param config {dictionary} Dictionary containing a `modelInfo key |
| 46 | +// holding `data`inputs`clust`exemplars, as returned by clust.ap.fit |
| 47 | +// @param data {float[][]} Each column of the data is an individual datapoint |
| 48 | +// @return {long[]} Predicted clusters |
| 49 | +clust.ap.predict:{[config;data] |
| 50 | + config:config`modelInfo; |
35 | 51 | data:clust.i.floatConversion[data];
|
36 |
| - if[-1~first cfg`clt; |
37 |
| - '"'.ml.clust.ap.fit' did not converge, all clusters returned -1. Cannot predict new data."]; |
38 |
| - // retrieve cluster centres from training data |
39 |
| - ex:cfg[`data][;distinct cfg`exemplars]; |
40 |
| - // predict testing data clusters |
41 |
| - clust.i.appreddist[ex;cfg[`inputs]`df]each$[0h=type data;flip;enlist]data |
42 |
| - } |
43 |
| - |
44 |
| - |
45 |
| -// Utilities |
46 |
| - |
47 |
| -// @kind function |
48 |
| -// @category private |
49 |
| -// @fileoverview Run affinity propagation algorithm |
50 |
| -// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
51 |
| -// @param df {symbol} Distance function name within '.ml.clust.df' |
52 |
| -// @param dmp {float} Damping coefficient |
53 |
| -// @param diag {func} Function applied to the similarity matrix diagonal |
54 |
| -// @param idxs {long[]} List of indices to find distances for |
55 |
| -// @param iter {dict} Max number of overall iterations and iterations |
56 |
| -// without a change in clusters. (::) can be passed in where the defaults |
57 |
| -// of (`total`nochange!200 15) will be used |
58 |
| -// @return {long[]} List of clusters |
59 |
| -clust.i.runap:{[data;df;dmp;diag;idxs;iter] |
60 |
| - // check negative euclidean distance has been given |
61 |
| - if[df<>`nege2dist;clust.i.err.ap[]]; |
62 |
| - // calculate distances, availability and responsibility |
63 |
| - info0:clust.i.apinit[data;df;diag;idxs]; |
64 |
| - // initialize exemplar matrix and convergence boolean |
65 |
| - info0,:`emat`conv`iter!((count data 0;iter`nochange)#0b;0b;iter); |
66 |
| - // run ap algo until maximum number of iterations completed or convergence |
67 |
| - info1:clust.i.apstop clust.i.apalgo[dmp]/info0; |
68 |
| - // return data, inputs, clusters and exemplars |
69 |
| - inputs:`df`dmp`diag`iter!(df;dmp;diag;iter); |
70 |
| - exemplars:info1`exemplars; |
71 |
| - clt:$[info1`conv;clust.i.reindex exemplars;count[data 0]#-1]; |
72 |
| - `data`inputs`clt`exemplars!(data;inputs;clt;exemplars) |
73 |
| - } |
74 |
| - |
75 |
| -// @kind function |
76 |
| -// @category private |
77 |
| -// @fileoverview Initialize matrices |
78 |
| -// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
79 |
| -// @param df {symbol} Distance function name within '.ml.clust.df' |
80 |
| -// @param diag {func} Function applied to the similarity matrix diagonal |
81 |
| -// @param idxs {long[]} List of point indices |
82 |
| -// @return {dict} Similarity, availability and responsibility matrices |
83 |
| -// and keys for matches and exemplars to be filled during further iterations |
84 |
| -clust.i.apinit:{[data;df;diag;idxs] |
85 |
| - // calculate similarity matrix values |
86 |
| - s:clust.i.dists[data;df;data]each idxs; |
87 |
| - // update diagonal |
88 |
| - s:@[;;:;diag raze s]'[s;k:til n:count data 0]; |
89 |
| - // create lists/matrices of zeros for other variables |
90 |
| - `matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f |
91 |
| - } |
92 |
| - |
93 |
| -// @kind function |
94 |
| -// @category private |
95 |
| -// @fileoverview Run affinity propagation algorithm |
96 |
| -// @param dmp {float} Damping coefficient |
97 |
| -// @param info {dict} Similarity, availability, responsibility, exemplars, |
98 |
| -// matches, iter dictionary, no_conv boolean and iter dict |
99 |
| -// @return {dict} Updated inputs |
100 |
| -clust.i.apalgo:{[dmp;info] |
101 |
| - // update responsibility matrix |
102 |
| - info[`r]:clust.i.updr[dmp;info]; |
103 |
| - // update availability matrix |
104 |
| - info[`a]:clust.i.upda[dmp;info]; |
105 |
| - // find new exemplars |
106 |
| - ex:imax each sum info`a`r; |
107 |
| - // update `info` with new exemplars/matches |
108 |
| - info:update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info; |
109 |
| - // update iter dictionary |
110 |
| - .[clust.i.apconv info;(`iter;`run);+[1]] |
111 |
| - } |
112 |
| - |
113 |
| -// @kind function |
114 |
| -// @category private |
115 |
| -// @fileoverview Check affinity propagation algorithm for convergence |
116 |
| -// @param info {dict} Similarity, availability, responsibility, exemplars, |
117 |
| -// matches, iter dictionary, no_conv boolean and iter dict |
118 |
| -// @return {dict} Updated info dictionary |
119 |
| -clust.i.apconv:{[info] |
120 |
| - // iteration dictionary |
121 |
| - iter:info`iter; |
122 |
| - // exemplar matrix |
123 |
| - emat:info`emat; |
124 |
| - // existing exemplars |
125 |
| - ediag:0<sum clust.i.diag each info`a`r; |
126 |
| - emat[;iter[`run]mod iter`nochange]:ediag; |
127 |
| - // check for convergence |
128 |
| - if[iter[`nochange]<=iter`run; |
129 |
| - unconv:count[info`s]<>sum(se=iter`nochange)+0=se:sum each emat; |
130 |
| - conv:$[(iter[`total]=iter`run)|not[unconv]&sum[ediag]>0;1b;0b]]; |
131 |
| - // return updated info |
132 |
| - info,`emat`conv!(emat;conv) |
133 |
| - } |
134 |
| - |
135 |
| -// @kind function |
136 |
| -// @category private |
137 |
| -// @fileoverview Retrieve diagonal from a square matrix |
138 |
| -// @param m {any[][]} Square matrix |
139 |
| -// @return {any[]} Matrix diagonal |
140 |
| -clust.i.diag:{[m] |
141 |
| - {x y}'[m;til count m] |
142 |
| - } |
143 |
| - |
144 |
| -// @kind function |
145 |
| -// @category private |
146 |
| -// @fileoverview Update responsibility matrix |
147 |
| -// @param dmp {float} Damping coefficient |
148 |
| -// @param info {dict} Similarity, availability, responsibility, exemplars, |
149 |
| -// matches, iter dictionary, no_conv boolean and iter dict |
150 |
| -// @return {float[][]} Updated responsibility matrix |
151 |
| -clust.i.updr:{[dmp;info] |
152 |
| - // create matrix with every points max responsibility |
153 |
| -// diagonal becomes -inf, current max becomes second max |
154 |
| - mxresp:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]}; |
155 |
| - mx:mxresp'[sum info`s`a;til count info`r]; |
156 |
| - // calculate new responsibility |
157 |
| - (dmp*info`r)+(1-dmp)*info[`s]-mx |
158 |
| - } |
159 |
| - |
160 |
| -// @kind function |
161 |
| -// @category private |
162 |
| -// @fileoverview Update availability matrix |
163 |
| -// @param dmp {float} Damping coefficient |
164 |
| -// @param info {dict} Similarity, availability, responsibility, exemplars, |
165 |
| -// matches, iter dictionary, no_conv boolean and iter dict |
166 |
| -// @return {float[][]} Returns updated availability matrix |
167 |
| -clust.i.upda:{[dmp;info] |
168 |
| - // sum values in positive availability matrix |
169 |
| - s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a]; |
170 |
| - // create a matrix using the negative values produced by the availability sum |
171 |
| - // + responsibility diagonal - positive availability values |
172 |
| - a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s]; |
173 |
| - // calculate new availability |
174 |
| - (dmp*info`a)+a*1-dmp |
175 |
| - } |
176 |
| - |
177 |
| -// @kind function |
178 |
| -// @category private |
179 |
| -// @fileoverview Stopping condition for affinity propagation algorithm |
180 |
| -// @param info {dict} Similarity, availability, responsibility, exemplars, |
181 |
| -// matches, iter dictionary, no_conv boolean and iter dict |
182 |
| -// @return {bool} Indicates whether to continue or stop running AP (1/0b) |
183 |
| -clust.i.apstop:{[info] |
184 |
| - (info[`iter;`total]>info[`iter]`run)¬ 1b~info`conv |
185 |
| - } |
186 |
| - |
187 |
| -// @kind function |
188 |
| -// @category private |
189 |
| -// @fileoverview Predict clusters using AP training exemplars |
190 |
| -// @param ex {float[][]} Training cluster centres in matrix format, |
191 |
| -// each column is an individual datapoint |
192 |
| -// @param df {symbol} Distance function name within '.ml.clust.df' |
193 |
| -// @param pt {float[]} Current data point |
194 |
| -// @return {long[]} Predicted clusters |
195 |
| -clust.i.appreddist:{[ex;df;pt] |
196 |
| - d?max d:clust.i.dists[ex;df;pt]each til count ex 0 |
| 52 | + if[-1~first config`clust; |
| 53 | + '"'.ml.clust.ap.fit' did not converge, all clusters returned -1.", |
| 54 | + " Cannot predict new data." |
| 55 | + ]; |
| 56 | + // Retrieve cluster centres from training data |
| 57 | + exemp:config[`data][;distinct config`exemplars]; |
| 58 | + // Predict testing data clusters |
| 59 | + data:$[0h=type data;flip;enlist]data; |
| 60 | + clust.i.apPredDist[exemp;config[`inputs]`df]each data |
197 | 61 | }
|
0 commit comments