Skip to content

Commit fab1b19

Browse files
committed
Added k to the metadata file when dumping to file; renamed num_docs to num_colors everywhere for consistency.
1 parent a651de0 commit fab1b19

18 files changed

+195
-186
lines changed

include/GGCAT.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ struct GGCAT {
7474
m_instance->dump_unitigs(m_graph_file, m_k, num_threads, num_threads == 1, callback, true);
7575
}
7676

77-
uint64_t num_docs() const { return m_filenames.size(); }
77+
uint64_t num_colors() const { return m_filenames.size(); }
7878
std::vector<std::string> const& filenames() const { return m_filenames; }
7979

8080
private:

include/build_util.hpp

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ void build_reference_sketches(index_type const& index,
1313
) {
1414
assert(num_threads > 0);
1515

16-
const uint64_t num_docs = index.num_docs();
16+
const uint64_t num_colors = index.num_colors();
1717
typename sketch::hll_t::HashType hasher;
1818
auto const& u2c = index.get_u2c();
1919
auto const& ccs = index.get_color_sets();
@@ -27,7 +27,7 @@ void build_reference_sketches(index_type const& index,
2727
}
2828

2929
std::vector<std::vector<sketch::hll_t>> thread_sketches(
30-
num_threads, std::vector<sketch::hll_t>(num_docs, sketch::hll_t(p)));
30+
num_threads, std::vector<sketch::hll_t>(num_colors, sketch::hll_t(p)));
3131

3232
struct slice {
3333
uint64_t begin; // start position in u2c
@@ -105,7 +105,7 @@ void build_reference_sketches(index_type const& index,
105105
}
106106
for (uint64_t i = 0; i != size; ++i, ++it) {
107107
uint32_t ref_id = *it;
108-
assert(ref_id < num_docs);
108+
assert(ref_id < num_colors);
109109
for (auto hash : hashes) sketches[ref_id].add(hash);
110110
}
111111
prev_pos = curr_pos + 1;
@@ -122,7 +122,7 @@ void build_reference_sketches(index_type const& index,
122122
}
123123

124124
/* merge sketches into thread_sketches[0] */
125-
for (uint64_t i = 0; i != num_docs; ++i) {
125+
for (uint64_t i = 0; i != num_colors; ++i) {
126126
auto& sketch = thread_sketches[0][i];
127127
for (uint64_t thread_id = 1; thread_id != num_threads; ++thread_id) {
128128
sketch += thread_sketches[thread_id][i];
@@ -133,7 +133,7 @@ void build_reference_sketches(index_type const& index,
133133
if (!out.is_open()) throw std::runtime_error("cannot open file");
134134
const uint64_t num_bytes = 1ULL << p;
135135
out.write(reinterpret_cast<char const*>(&num_bytes), 8);
136-
out.write(reinterpret_cast<char const*>(&num_docs), 8);
136+
out.write(reinterpret_cast<char const*>(&num_colors), 8);
137137
for (auto const& x : thread_sketches[0]) {
138138
assert(x.m() == num_bytes);
139139
assert(x.m() == x.core().size());
@@ -145,18 +145,18 @@ void build_reference_sketches(index_type const& index,
145145

146146
template <typename Iterator>
147147
void build_colors_sketches_sliced(
148-
uint64_t num_docs, uint64_t num_color_sets, function<Iterator(uint64_t)> colors,
148+
uint64_t num_colors, uint64_t num_color_sets, function<Iterator(uint64_t)> colors,
149149
uint64_t p, // use 2^p bytes per HLL sketch
150150
uint64_t num_threads, // num. threads for construction
151151
std::string output_filename, // where the sketches will be serialized
152152
double left, double right) //
153153
{
154154
assert(num_threads > 0);
155155

156-
const double min_size = left * num_docs;
157-
const double max_size = right * num_docs;
156+
const double min_size = left * num_colors;
157+
const double max_size = right * num_colors;
158158
assert(min_size >= 0);
159-
assert(max_size <= num_docs);
159+
assert(max_size <= num_colors);
160160

161161
if (num_color_sets < num_threads) { num_threads = num_color_sets; }
162162

@@ -215,7 +215,7 @@ void build_colors_sketches_sliced(
215215
assert(size > 0);
216216
for (uint64_t i = 0; i < size; ++i, ++it) {
217217
uint64_t ref_id = *it;
218-
assert(ref_id < num_docs);
218+
assert(ref_id < num_colors);
219219
sketches[color_id - s.begin].addh(ref_id);
220220
}
221221
}
@@ -233,7 +233,7 @@ void build_colors_sketches_sliced(
233233
if (!out.is_open()) throw std::runtime_error("cannot open file");
234234
const uint64_t num_bytes = 1ULL << p;
235235
out.write(reinterpret_cast<char const*>(&num_bytes), 8);
236-
out.write(reinterpret_cast<char const*>(&num_docs), 8);
236+
out.write(reinterpret_cast<char const*>(&num_colors), 8);
237237
out.write(reinterpret_cast<char const*>(&partition_size), 8);
238238
for (auto const color_id : filtered_colors_ids) {
239239
out.write(reinterpret_cast<char const*>(&color_id), 8);

include/builder.hpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ struct index<ColorSets>::builder {
2020
essentials::logger("step 1. build colored compacted dBG");
2121
timer.start();
2222
m_ccdbg.build(m_build_config);
23-
m_build_config.num_docs = m_ccdbg.num_docs();
23+
m_build_config.num_colors = m_ccdbg.num_colors();
2424
timer.stop();
2525
std::cout << "** building the ccdBG took " << timer.elapsed() << " seconds / "
2626
<< timer.elapsed() / 60 << " minutes" << std::endl;
@@ -40,7 +40,7 @@ struct index<ColorSets>::builder {
4040
std::ofstream out((m_build_config.file_base_name + ".fa").c_str());
4141
if (!out.is_open()) throw std::runtime_error("cannot open output file");
4242

43-
typename ColorSets::builder colors_builder(m_build_config.num_docs);
43+
typename ColorSets::builder colors_builder(m_build_config.num_colors);
4444

4545
m_ccdbg.loop_through_unitigs([&](ggcat::Slice<char> const unitig,
4646
ggcat::Slice<uint32_t> const colors, bool same_color) {

include/color_sets/differential.hpp

Lines changed: 34 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,8 @@ struct differential {
1212
m_representative_offsets.push_back(0);
1313
}
1414

15-
void init_colors_builder(uint64_t num_docs) {
16-
m_num_docs = num_docs;
15+
void init_colors_builder(uint64_t num_colors) {
16+
m_num_colors = num_colors;
1717
m_num_total_integers = 0;
1818
m_num_lists = 0;
1919
}
@@ -99,7 +99,7 @@ struct differential {
9999
}
100100

101101
void build(differential& d) {
102-
d.m_num_docs = m_num_docs;
102+
d.m_num_colors = m_num_colors;
103103
d.m_colors.swap(m_bvb.bits());
104104
d.m_clusters.build(&m_clusters);
105105

@@ -132,7 +132,7 @@ struct differential {
132132
pthash::bit_vector_builder m_clusters;
133133
uint64_t m_num_total_integers, m_num_lists;
134134

135-
uint64_t m_num_docs;
135+
uint64_t m_num_colors;
136136
uint64_t m_prev_cluster_id;
137137
std::vector<uint64_t> m_representative_offsets, m_list_offsets;
138138
};
@@ -158,11 +158,11 @@ struct differential {
158158
m_size = util::read_delta(m_differential_list_it);
159159

160160
m_curr_differential_val = m_differential_list_size == 0
161-
? num_docs()
161+
? num_colors()
162162
: util::read_delta(m_differential_list_it);
163163
m_prev_differential_val = 0;
164164
m_curr_representative_val =
165-
m_representative_size == 0 ? num_docs() : util::read_delta(m_representative_it);
165+
m_representative_size == 0 ? num_colors() : util::read_delta(m_representative_it);
166166
m_prev_representative_val = 0;
167167

168168
m_pos_in_differential_list = 0;
@@ -178,7 +178,7 @@ struct differential {
178178
void next() {
179179
if (m_pos_in_representative >= m_representative_size &&
180180
m_pos_in_differential_list >= m_differential_list_size) {
181-
m_curr_val = num_docs();
181+
m_curr_val = num_colors();
182182
return;
183183
}
184184
if (m_pos_in_representative >= m_representative_size ||
@@ -193,12 +193,12 @@ struct differential {
193193
void operator++() { next(); }
194194

195195
void next_geq(const uint64_t lower_bound) {
196-
assert(lower_bound <= num_docs());
196+
assert(lower_bound <= num_colors());
197197
while (value() < lower_bound) next();
198198
assert(value() >= lower_bound);
199199
}
200200

201-
uint32_t num_docs() const { return m_ptr->m_num_docs; }
201+
uint32_t num_colors() const { return m_ptr->m_num_colors; }
202202
uint64_t differential_list_size() const { return m_differential_list_size; }
203203

204204
int type() const { return list_type::differential_list; }
@@ -221,7 +221,7 @@ struct differential {
221221
m_curr_representative_val =
222222
m_prev_representative_val + util::read_delta(m_representative_it) + 1;
223223
} else {
224-
m_curr_representative_val = num_docs();
224+
m_curr_representative_val = num_colors();
225225
}
226226
}
227227

@@ -232,7 +232,7 @@ struct differential {
232232
m_curr_differential_val =
233233
m_prev_differential_val + util::read_delta(m_differential_list_it) + 1;
234234
} else {
235-
m_curr_differential_val = num_docs();
235+
m_curr_differential_val = num_colors();
236236
}
237237
}
238238

@@ -259,10 +259,10 @@ struct differential {
259259

260260
uint64_t num_color_sets() const { return m_list_offsets.size() - 1; }
261261
uint64_t num_partitions() const { return m_clusters.num_ones() + 1; }
262-
uint64_t num_docs() const { return m_num_docs; }
262+
uint64_t num_colors() const { return m_num_colors; }
263263

264264
uint64_t num_bits() const {
265-
return sizeof(m_num_docs) * 8 + m_representative_offsets.num_bits() +
265+
return sizeof(m_num_colors) * 8 + m_representative_offsets.num_bits() +
266266
m_list_offsets.num_bits() + essentials::vec_bytes(m_colors) * 8 +
267267
m_clusters.bytes() * 8;
268268
}
@@ -271,16 +271,16 @@ struct differential {
271271
std::cout << "Color statistics:\n";
272272
std::cout << " Number of partitions: " << num_partitions() << std::endl;
273273

274-
uint64_t num_representative_offsets = m_representative_offsets.num_bits();
275-
uint64_t num_list_offsets = m_list_offsets.num_bits();
276-
uint64_t num_colors = essentials::vec_bytes(m_colors) * 8;
277-
uint64_t num_clusters = m_clusters.size();
274+
uint64_t num_bits_representative_offsets = m_representative_offsets.num_bits();
275+
uint64_t num_bits_list_offsets = m_list_offsets.num_bits();
276+
uint64_t num_bits_colors = essentials::vec_bytes(m_colors) * 8;
278277

278+
uint64_t num_clusters = m_clusters.size();
279279
uint64_t num_representatives = 0;
280280
uint64_t num_differential_lists = 0;
281281
uint64_t num_metadata = 0;
282282

283-
uint64_t num_docs_tenth = num_docs() / 10;
283+
uint64_t num_colors_tenth = num_colors() / 10;
284284

285285
std::vector<uint64_t> distribution(11, 0);
286286

@@ -322,29 +322,31 @@ struct differential {
322322
prev_position = it.position();
323323
}
324324
uint64_t q = 0;
325-
if (num_docs_tenth != 0)
326-
q = size / (num_docs_tenth) > 10 ? 10 : size / (num_docs_tenth);
325+
if (num_colors_tenth != 0) {
326+
q = size / (num_colors_tenth) > 10 ? 10 : size / (num_colors_tenth);
327+
}
327328

328329
distribution[q]++;
329330
}
330331

331332
assert(num_bits() > 0);
332-
assert(num_colors > 0);
333+
assert(num_bits_colors > 0);
333334

334-
std::cout << " representative offsets: " << num_representative_offsets / 8 << " bytes ("
335-
<< (num_representative_offsets * 100.0) / num_bits() << "%)" << std::endl;
336-
std::cout << " differential list offsets: " << num_list_offsets / 8 << " bytes ("
337-
<< (num_list_offsets * 100.0) / num_bits() << "%)" << std::endl;
335+
std::cout << " representative offsets: " << num_bits_representative_offsets / 8
336+
<< " bytes (" << (num_bits_representative_offsets * 100.0) / num_bits() << "%)"
337+
<< std::endl;
338+
std::cout << " differential list offsets: " << num_bits_list_offsets / 8 << " bytes ("
339+
<< (num_bits_list_offsets * 100.0) / num_bits() << "%)" << std::endl;
338340
std::cout << " clusters: " << num_clusters / 8 << " bytes ("
339341
<< (num_clusters * 100.0) / num_bits() << "%)" << std::endl;
340-
std::cout << " differential colors: " << num_colors / 8 << " bytes ("
341-
<< (num_colors * 100.0) / num_bits() << "%)" << std::endl;
342+
std::cout << " differential colors: " << num_bits_colors / 8 << " bytes ("
343+
<< (num_bits_colors * 100.0) / num_bits() << "%)" << std::endl;
342344
std::cout << " representatives: " << num_representatives / 8 << " bytes ("
343-
<< (num_representatives * 100.0) / num_colors << "%)" << std::endl;
345+
<< (num_representatives * 100.0) / num_bits_colors << "%)" << std::endl;
344346
std::cout << " differential lists: " << num_differential_lists / 8 << " bytes ("
345-
<< (num_differential_lists * 100.0) / num_colors << "%)" << std::endl;
347+
<< (num_differential_lists * 100.0) / num_bits_colors << "%)" << std::endl;
346348
std::cout << " metadata: " << num_metadata / 8 << " bytes ("
347-
<< (num_metadata * 100.0) / num_colors << "%)" << std::endl;
349+
<< (num_metadata * 100.0) / num_bits_colors << "%)" << std::endl;
348350
std::cout << " differential lists size distribution:" << std::endl;
349351
for (uint64_t partition = 0; partition < 11; partition++) {
350352
std::cout << distribution[partition] << " ";
@@ -365,14 +367,14 @@ struct differential {
365367
private:
366368
template <typename Visitor, typename T>
367369
static void visit_impl(Visitor& visitor, T&& t) {
368-
visitor.visit(t.m_num_docs);
370+
visitor.visit(t.m_num_colors);
369371
visitor.visit(t.m_representative_offsets);
370372
visitor.visit(t.m_list_offsets);
371373
visitor.visit(t.m_colors);
372374
visitor.visit(t.m_clusters);
373375
}
374376

375-
uint32_t m_num_docs;
377+
uint32_t m_num_colors;
376378

377379
sshash::ef_sequence<false> m_representative_offsets, m_list_offsets;
378380

0 commit comments

Comments
 (0)