|
1 | 1 | \d .ml
|
2 | 2 |
|
3 |
| -// Affinity propagation algorithm |
4 |
| -/* data = data points in `value flip` format |
5 |
| -/* df = distance function |
6 |
| -/* dmp = damping coefficient |
7 |
| -/* diag = similarity matrix diagonal value function |
8 |
| -/. r > return list of clusters |
9 |
| -clust.ap:{[data;df;dmp;diag] |
10 |
| - // check distance function and diagonal value |
11 |
| - if[not df in key clust.i.dd;clust.i.err.dd[]]; |
12 |
| - // create initial table with exemplars/matches and similarity, availability and responsibility matrices |
13 |
| - info0:clust.i.apinit["f"$data;df;diag]; |
14 |
| - // run AP algo until there is no change in results over `0.1*count data` runs |
15 |
| - info1:{[maxiter;info]maxiter>info`matches}[.1*count data]clust.i.apalgo[dmp]/info0; |
16 |
| - // return list of clusters |
17 |
| - clust.i.reindex info1`exemplars} |
| 3 | +// Affinity Propagation |
18 | 4 |
|
19 |
| -// Initialize matrices |
20 |
| -/* data = data points in `value flip` format |
21 |
| -/* df = distance function |
22 |
| -/* diag = similarity matrix diagonal value |
23 |
| -/. r > returns a dictionary with similarity, availability and responsibility matrices |
24 |
| -/ and keys for matches and exemplars to be filled during further iterations |
25 |
| -clust.i.apinit:{[data;df;diag] |
26 |
| - // calculate similarity matrix values |
27 |
| - s:@[;;:;diag raze s]'[s:clust.i.dists[data;df;data]each k;k:til n:count data 0]; |
28 |
| - // create lists/matrices of zeros for other variables |
29 |
| - `matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f} |
| 5 | +// @kind function |
| 6 | +// @category clust |
| 7 | +// @fileoverview Fit affinity propagation algorithm |
| 8 | +// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
| 9 | +// @param df {symbol} Distance function name within '.ml.clust.df' |
| 10 | +// @param dmp {float} Damping coefficient |
| 11 | +// @param diag {func} Function applied to the similarity matrix diagonal |
| 12 | +// @param iter {dict} Max number of overall iterations and iterations |
| 13 | +// without a change in clusters. (::) can be passed in which case the defaults |
| 14 | +// of (`total`nochange!200 15) will be used |
| 15 | +// @return {dict} Data, input variables, clusters and exemplars |
| 16 | +// (`data`inputs`clt`exemplars) required for the predict method |
| 17 | +clust.ap.fit:{[data;df;dmp;diag;iter] |
| 18 | + data:clust.i.floatConversion[data]; |
| 19 | + defaultDict:`run`total`nochange!0 200 15; |
| 20 | + if[iter~(::);iter:()!()]; |
| 21 | + if[99h<>type iter;'"iter must be (::) or a dictionary"]; |
| 22 | + // update iteration dictionary with user changes |
| 23 | + updDict:defaultDict,iter; |
| 24 | + // cluster data using AP algo |
| 25 | + clust.i.runap[data;df;dmp;diag;til count data 0;updDict] |
| 26 | + } |
30 | 27 |
|
31 |
| -// Run affinity propagation algorithm |
32 |
| -/* dmp = damping coefficient |
33 |
| -/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices |
34 |
| -/. r > returns updated info |
| 28 | +// @kind function |
| 29 | +// @category clust |
| 30 | +// @fileoverview Predict clusters using AP config |
| 31 | +// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
| 32 | +// @param cfg {dict} `data`inputs`clt`exemplars returned by clust.ap.fit |
| 33 | +// @return {long[]} List of predicted clusters |
| 34 | +clust.ap.predict:{[data;cfg] |
| 35 | + data:clust.i.floatConversion[data]; |
| 36 | + if[-1~first cfg`clt; |
| 37 | + '"'.ml.clust.ap.fit' did not converge, all clusters returned -1. Cannot predict new data."]; |
| 38 | + // retrieve cluster centres from training data |
| 39 | + ex:cfg[`data][;distinct cfg`exemplars]; |
| 40 | + // predict testing data clusters |
| 41 | + clust.i.appreddist[ex;cfg[`inputs]`df]each$[0h=type data;flip;enlist]data |
| 42 | + } |
| 43 | + |
| 44 | + |
| 45 | +// Utilities |
| 46 | + |
| 47 | +// @kind function |
| 48 | +// @category private |
| 49 | +// @fileoverview Run affinity propagation algorithm |
| 50 | +// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
| 51 | +// @param df {symbol} Distance function name within '.ml.clust.df' |
| 52 | +// @param dmp {float} Damping coefficient |
| 53 | +// @param diag {func} Function applied to the similarity matrix diagonal |
| 54 | +// @param idxs {long[]} List of indices to find distances for |
| 55 | +// @param iter {dict} Max number of overall iterations (`total), iterations |
| 56 | +// without a change in clusters (`nochange) and the current run count (`run), |
| 57 | +// as assembled by clust.ap.fit; the keys are read directly, so a dictionary is required |
| 58 | +// @return {dict} Data, input variables, clusters and exemplars (`data`inputs`clt`exemplars) |
| 59 | +clust.i.runap:{[data;df;dmp;diag;idxs;iter] |
| 60 | + // check negative euclidean distance has been given |
| 61 | + if[df<>`nege2dist;clust.i.err.ap[]]; |
| 62 | + // calculate distances, availability and responsibility |
| 63 | + info0:clust.i.apinit[data;df;diag;idxs]; |
| 64 | + // initialize exemplar matrix and convergence boolean |
| 65 | + info0,:`emat`conv`iter!((count data 0;iter`nochange)#0b;0b;iter); |
| 66 | + // run ap algo until maximum number of iterations completed or convergence |
| 67 | + info1:clust.i.apstop clust.i.apalgo[dmp]/info0; |
| 68 | + // return data, inputs, clusters and exemplars |
| 69 | + inputs:`df`dmp`diag`iter!(df;dmp;diag;iter); |
| 70 | + exemplars:info1`exemplars; |
| 71 | + clt:$[info1`conv;clust.i.reindex exemplars;count[data 0]#-1]; |
| 72 | + `data`inputs`clt`exemplars!(data;inputs;clt;exemplars) |
| 73 | + } |
| 74 | + |
| 75 | +// @kind function |
| 76 | +// @category private |
| 77 | +// @fileoverview Initialize matrices |
| 78 | +// @param data {float[][]} Data in matrix format, each column is an individual datapoint |
| 79 | +// @param df {symbol} Distance function name within '.ml.clust.df' |
| 80 | +// @param diag {func} Function applied to the similarity matrix diagonal |
| 81 | +// @param idxs {long[]} List of point indices |
| 82 | +// @return {dict} Similarity, availability and responsibility matrices |
| 83 | +// and keys for matches and exemplars to be filled during further iterations |
| 84 | +clust.i.apinit:{[data;df;diag;idxs] |
| 85 | + // calculate similarity matrix values |
| 86 | + s:clust.i.dists[data;df;data]each idxs; |
| 87 | + // update diagonal |
| 88 | + s:@[;;:;diag raze s]'[s;k:til n:count data 0]; |
| 89 | + // create lists/matrices of zeros for other variables |
| 90 | + `matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f |
| 91 | + } |
| 92 | + |
| 93 | +// @kind function |
| 94 | +// @category private |
| 95 | +// @fileoverview Run affinity propagation algorithm |
| 96 | +// @param dmp {float} Damping coefficient |
| 97 | +// @param info {dict} Similarity, availability, responsibility, exemplars, |
| 98 | +// matches, exemplar matrix, conv boolean and iter dictionary |
| 99 | +// @return {dict} Updated inputs |
35 | 100 | clust.i.apalgo:{[dmp;info]
|
36 |
| - // update responsibility matrix |
37 |
| - info[`r]:clust.i.updr[dmp;info]; |
38 |
| - // update availability matrix |
39 |
| - info[`a]:clust.i.upda[dmp;info]; |
40 |
| - // find new exemplars |
41 |
| - ex:imax each sum info`a`r; |
42 |
| - // return updated `info` with new exemplars/matches |
43 |
| - update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info} |
| 101 | + // update responsibility matrix |
| 102 | + info[`r]:clust.i.updr[dmp;info]; |
| 103 | + // update availability matrix |
| 104 | + info[`a]:clust.i.upda[dmp;info]; |
| 105 | + // find new exemplars |
| 106 | + ex:imax each sum info`a`r; |
| 107 | + // update `info` with new exemplars/matches |
| 108 | + info:update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info; |
| 109 | + // update iter dictionary |
| 110 | + .[clust.i.apconv info;(`iter;`run);+[1]] |
| 111 | + } |
44 | 112 |
|
45 |
| -// Update responsibility matrix |
46 |
| -/* dmp = damping coefficient |
47 |
| -/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices |
48 |
| -/. r > returns updated responsibility matrix |
| 113 | +// @kind function |
| 114 | +// @category private |
| 115 | +// @fileoverview Check affinity propagation algorithm for convergence |
| 116 | +// @param info {dict} Similarity, availability, responsibility, exemplars, |
| 117 | +// matches, exemplar matrix (`emat), conv flag and iter dictionary (`run`total`nochange) |
| 118 | +// @return {dict} info with `emat updated and `conv set to a boolean convergence flag |
| 119 | +clust.i.apconv:{[info] |
| 120 | + // iteration dictionary |
| 121 | + iter:info`iter; |
| 122 | + // exemplar matrix: one row per point, one column per run in the `nochange window |
| 123 | + emat:info`emat; |
| 124 | + // points currently acting as exemplars: positive availability+responsibility on the diagonal |
| 125 | + ediag:0<sum clust.i.diag each info`a`r; |
| 126 | + emat[;iter[`run]mod iter`nochange]:ediag; |
| 127 | + // check for convergence, only once at least `nochange runs have completed |
| 128 | + if[iter[`nochange]<=iter`run; |
| 129 | + unconv:count[info`s]<>sum(se=iter`nochange)+0=se:sum each emat; |
| 130 | + conv:$[(iter[`total]=iter`run)|not[unconv]&sum[ediag]>0;1b;0b]]; |
| 131 | + // return updated info; conv is only assigned inside the check above, so 1b~ keeps the flag boolean before then |
| 132 | + info,`emat`conv!(emat;1b~conv) |
| 133 | + } |
| 134 | + |
| 135 | +// @kind function |
| 136 | +// @category private |
| 137 | +// @fileoverview Retrieve diagonal from a square matrix |
| 138 | +// @param m {any[][]} Square matrix |
| 139 | +// @return {any[]} Matrix diagonal |
| 140 | +clust.i.diag:{[m] |
| 141 | + {x y}'[m;til count m] |
| 142 | + } |
| 143 | + |
| 144 | +// @kind function |
| 145 | +// @category private |
| 146 | +// @fileoverview Update responsibility matrix |
| 147 | +// @param dmp {float} Damping coefficient |
| 148 | +// @param info {dict} Similarity, availability, responsibility, exemplars, |
| 149 | +// matches, exemplar matrix, conv boolean and iter dictionary |
| 150 | +// @return {float[][]} Updated responsibility matrix |
49 | 151 | clust.i.updr:{[dmp;info]
|
50 |
| - // create matrix with every points max responsibility - diagonal becomes -inf, current max is becomes second max |
51 |
| - mx:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]}'[sum info`s`a;til count info`r]; |
52 |
| - // calculate new responsibility |
53 |
| - (dmp*info`r)+(1-dmp)*info[`s]-mx} |
| 152 | + // create matrix with every points max responsibility |
| 153 | + // diagonal becomes -inf, current max becomes second max |
| 154 | + mxresp:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]}; |
| 155 | + mx:mxresp'[sum info`s`a;til count info`r]; |
| 156 | + // calculate new responsibility |
| 157 | + (dmp*info`r)+(1-dmp)*info[`s]-mx |
| 158 | + } |
54 | 159 |
|
55 |
| -// Update availability matrix |
56 |
| -/* dmp = damping coefficient |
57 |
| -/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices |
58 |
| -/. r > returns updated availability matrix |
| 160 | +// @kind function |
| 161 | +// @category private |
| 162 | +// @fileoverview Update availability matrix |
| 163 | +// @param dmp {float} Damping coefficient |
| 164 | +// @param info {dict} Similarity, availability, responsibility, exemplars, |
| 165 | +// matches, exemplar matrix, conv boolean and iter dictionary |
| 166 | +// @return {float[][]} Returns updated availability matrix |
59 | 167 | clust.i.upda:{[dmp;info]
|
60 |
| - // sum values in positive availability matrix |
61 |
| - s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a]; |
62 |
| - // create a matrix using the negative values produced by the availability sum + responsibility diagonal - positive availability values |
63 |
| - a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s]; |
64 |
| - // calculate new availability |
65 |
| - (dmp*info`a)+a*1-dmp} |
| 168 | + // column sums of the non-negative responsibilities, with the diagonal zeroed first |
| 169 | + s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a]; |
| 170 | + // create a matrix using the negative values produced by the availability sum |
| 171 | + // + responsibility diagonal - positive availability values |
| 172 | + a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s]; |
| 173 | + // calculate new availability |
| 174 | + (dmp*info`a)+a*1-dmp |
| 175 | + } |
| 176 | + |
| 177 | +// @kind function |
| 178 | +// @category private |
| 179 | +// @fileoverview Stopping condition for affinity propagation algorithm |
| 180 | +// @param info {dict} Similarity, availability, responsibility, exemplars, |
| 181 | +// matches, exemplar matrix, conv flag and iter dictionary (`run`total`nochange) |
| 182 | +// @return {bool} 1b to keep iterating (run count below `total and conv is not 1b), 0b to stop |
| 183 | +clust.i.apstop:{[info] |
| 184 | + (info[`iter;`total]>info[`iter]`run)&not 1b~info`conv |
| 185 | + } |
| 186 | + |
| 187 | +// @kind function |
| 188 | +// @category private |
| 189 | +// @fileoverview Predict clusters using AP training exemplars |
| 190 | +// @param ex {float[][]} Training cluster centres in matrix format, |
| 191 | +// each column is an individual datapoint |
| 192 | +// @param df {symbol} Distance function name within '.ml.clust.df' |
| 193 | +// @param pt {float[]} Current data point |
| 194 | +// @return {long[]} Predicted clusters |
| 195 | +clust.i.appreddist:{[ex;df;pt] |
| 196 | + d?max d:clust.i.dists[ex;df;pt]each til count ex 0 |
| 197 | + } |
0 commit comments