Skip to content

Commit 6ac12c7

Browse files
authored
Memory optimizations
* Row identifier removed from distance structure.
* Object identifiers in complete linkage stored as 32-bit integers.
* For some clustering algorithms only edges are stored (without distances).
1 parent 5d34c00 commit 6ac12c7

24 files changed

+1713
-1486
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,6 @@
88
/libs/winflexbison
99
/src/clusty/x64/Debug/clusty.tlog
1010
/src/clusty/x64/Debug
11-
/src
11+
/src/clusty/
12+
13+
/src/clusty.vcxproj.user

makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,10 @@ OBJS := \
101101
$(MIMALLOC_OBJ) \
102102
$(MAIN_DIR)/console.o \
103103
$(MAIN_DIR)/conversion.o \
104-
$(MAIN_DIR)/distances.o \
104+
$(MAIN_DIR)/graph.o \
105105
$(MAIN_DIR)/log.o \
106106
$(MAIN_DIR)/main.o \
107+
$(MAIN_DIR)/params.o \
107108

108109
%.o: %.cpp igraph
109110
$(CXX) $(CFLAGS) -c $< -o $@

src/cd_hit.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717
#include <vector>
1818
#include <unordered_map>
1919

20-
template <class DistanceMatrix>
21-
class CdHit : public IClustering<DistanceMatrix> {
20+
template <class Distance>
21+
class CdHit : public IClustering<Distance> {
2222
public:
2323

2424
int operator()(
25-
const DistanceMatrix& distances,
25+
SparseMatrix<Distance>& distances,
2626
const std::vector<int>& objects,
2727
double threshold,
2828
std::vector<int>& assignments) override {
@@ -41,10 +41,10 @@ class CdHit : public IClustering<DistanceMatrix> {
4141
assignments[obj] = cluster_id;
4242

4343
// iterate over connected objects and assign those which are unassigned
44-
for (const dist_t* edge = distances.begin(obj); edge < distances.end(obj); ++edge) {
45-
int other = edge->u.s.hi;
44+
for (const Distance* edge = distances.begin(obj); edge < distances.end(obj); ++edge) {
45+
int other = edge->get_id();
4646

47-
if (edge->d <= threshold && assignments[other] == -1) {
47+
if (edge->get_d() <= threshold && assignments[other] == -1) {
4848
assignments[other] = cluster_id;
4949
}
5050
}

src/clustering.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010

1111
#include <vector>
1212
#include <numeric>
13+
#include <algorithm>
1314

14-
#include "distances.h"
15+
#include "sparse_matrix.h"
1516

1617
struct node_t {
1718
int first = -1;
@@ -23,12 +24,12 @@ struct node_t {
2324
: first(first), second(second), distance(distance) {}
2425
};
2526

26-
template <class DistanceMatrix>
27+
template <class Distance>
2728
class IClustering {
2829
public:
2930

3031
virtual int operator()(
31-
const DistanceMatrix& distances,
32+
SparseMatrix<Distance>& distances,
3233
const std::vector<int>& objects,
3334
double threshold,
3435
std::vector<int>& assignments) = 0;
@@ -37,12 +38,12 @@ class IClustering {
3738

3839
};
3940

40-
template <class DistanceMatrix>
41-
class HierarchicalClustering : public IClustering<DistanceMatrix> {
41+
template <class Distance>
42+
class HierarchicalClustering : public IClustering<Distance> {
4243
protected:
4344

4445
void makeDendrogram(
45-
const std::vector<dist_t>& lambda,
46+
const std::vector<Distance>& lambda,
4647
const std::vector<int>& pi,
4748
std::vector<node_t>& dendrogram)
4849
{
@@ -51,7 +52,7 @@ class HierarchicalClustering : public IClustering<DistanceMatrix> {
5152
std::vector<int> elements(n_objects - 1);
5253
std::iota(elements.begin(), elements.end(), 0);
5354

54-
stable_sort(elements.begin(), elements.end(), [&lambda](int x, int y) {
55+
std::stable_sort(elements.begin(), elements.end(), [&lambda](int x, int y) {
5556
return lambda[x] < lambda[y];
5657
});
5758

@@ -64,7 +65,7 @@ class HierarchicalClustering : public IClustering<DistanceMatrix> {
6465
for (int i = 0; i < n_objects - 1; ++i) {
6566
int j = elements[i];
6667
int next = pi[j];
67-
dendrogram.emplace_back(index[j], index[next], lambda[j].d);
68+
dendrogram.emplace_back(index[j], index[next], lambda[j].get_d());
6869
index[next] = n_objects + i;
6970
}
7071
}

src/clusty.vcxproj

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@
133133
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
134134
<ConformanceMode>true</ConformanceMode>
135135
<BufferSecurityCheck>false</BufferSecurityCheck>
136-
<LanguageStandard>stdcpp17</LanguageStandard>
136+
<LanguageStandard>stdcpplatest</LanguageStandard>
137137
<MultiProcessorCompilation>true</MultiProcessorCompilation>
138138
</ClCompile>
139139
<Link>
@@ -151,7 +151,7 @@
151151
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
152152
<ConformanceMode>true</ConformanceMode>
153153
<BufferSecurityCheck>false</BufferSecurityCheck>
154-
<LanguageStandard>stdcpp17</LanguageStandard>
154+
<LanguageStandard>stdcpplatest</LanguageStandard>
155155
</ClCompile>
156156
<Link>
157157
<SubSystem>Console</SubSystem>
@@ -165,9 +165,10 @@
165165
<ClCompile Include="..\libs\mimalloc\src\static.c" />
166166
<ClCompile Include="console.cpp" />
167167
<ClCompile Include="conversion.cpp" />
168-
<ClCompile Include="distances.cpp" />
168+
<ClCompile Include="graph.cpp" />
169169
<ClCompile Include="log.cpp" />
170170
<ClCompile Include="main.cpp" />
171+
<ClCompile Include="params.cpp" />
171172
</ItemGroup>
172173
<ItemGroup>
173174
<ClInclude Include="cd_hit.h" />
@@ -179,8 +180,13 @@
179180
<ClInclude Include="log.h" />
180181
<ClInclude Include="memory_monotonic.h" />
181182
<ClInclude Include="distances.h" />
183+
<ClInclude Include="graph.h" />
184+
<ClInclude Include="params.h" />
182185
<ClInclude Include="set_cover.h" />
183186
<ClInclude Include="single_bfs.h" />
187+
<ClInclude Include="sparse_matrix.h" />
188+
<ClInclude Include="graph_named.h" />
189+
<ClInclude Include="graph_numbered.h" />
184190
<ClInclude Include="uclust.h" />
185191
<ClInclude Include="utils.h" />
186192
<ClInclude Include="version.h" />

src/clusty.vcxproj.filters

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,12 @@
1313
<ClCompile Include="main.cpp" />
1414
<ClCompile Include="console.cpp" />
1515
<ClCompile Include="conversion.cpp" />
16-
<ClCompile Include="distances.cpp" />
16+
<ClCompile Include="graph.cpp" />
1717
<ClCompile Include="..\libs\mimalloc\src\static.c">
1818
<Filter>Library Files</Filter>
1919
</ClCompile>
2020
<ClCompile Include="log.cpp" />
21+
<ClCompile Include="params.cpp" />
2122
</ItemGroup>
2223
<ItemGroup>
2324
<ClInclude Include="distances.h" />
@@ -34,5 +35,10 @@
3435
<ClInclude Include="set_cover.h" />
3536
<ClInclude Include="single_bfs.h" />
3637
<ClInclude Include="log.h" />
38+
<ClInclude Include="graph_named.h" />
39+
<ClInclude Include="graph_numbered.h" />
40+
<ClInclude Include="sparse_matrix.h" />
41+
<ClInclude Include="graph.h" />
42+
<ClInclude Include="params.h" />
3743
</ItemGroup>
3844
</Project>

0 commit comments

Comments
 (0)