|
4 | 4 | #include "include/DataStructures/DataVector/BQDataVectors.h"
|
5 | 5 | #include "include/read_data.h"
|
6 | 6 | #include "include/distance.h"
|
| 7 | +#include <fstream> |
7 | 8 |
|
| 9 | +/** |
| 10 | + * @brief Compute the groundtruth for a set of base and query vectors. |
| 11 | + * |
| 12 | + * This function computes the groundtruth for a set of base and query vectors by calculating the Euclidean distance |
| 13 | + * between each query vector and all base vectors. The function supports only two query types: 0 and 1. For query type 0, |
| 14 | + * the function computes the distances between the query vector and all base vectors. For query type 1, the function computes |
| 15 | + * the distances between the query vector and the base vectors with the same C value. |
| 16 | + * |
| 17 | + * @param base_vectors A vector of BaseDataVector objects representing the base vectors |
| 18 | + * @param query_vectors A vector of QueryDataVector objects representing the query vectors |
| 19 | + * @param maxDistances The maximum number of distances to compute for each query vector |
| 20 | + * |
| 21 | + * @return A 2D vector containing the computed distances for each query vector |
| 22 | + */ |
| 23 | +std::vector<std::vector<float>> computeGroundtruth( |
| 24 | + const std::vector<BaseDataVector<float>> base_vectors, const std::vector<QueryDataVector<float>> query_vectors, const unsigned int maxDistances) { |
8 | 25 |
|
9 |
| -int main(int argc, char* argv[]) { |
| 26 | + // Allocate memory for the distance vector |
| 27 | + std::vector<std::vector<float>> distances(query_vectors.size()); |
10 | 28 |
|
11 |
| - std::vector<BaseDataVector<float>> base_vectors = ReadFilteredBaseVectorFile("data/Dummy/dummy-data.bin"); |
12 |
| - std::vector<QueryDataVector<float>> query_vectors = ReadFilteredQueryVectorFile("data/Dummy/dummy-queries.bin"); |
| 29 | + // Compute the distances between the query vectors and the base vectors (with the same filter) |
| 30 | + // If no filter provided then compute to the whole graph |
| 31 | + for (auto query : query_vectors) { |
13 | 32 |
|
14 |
| - // Allocate memory for the distances between the query and base vectors |
15 |
| - double** distances = new double*[query_vectors.size()]; |
16 |
| - unsigned int** baseVectorIndeces = new unsigned int*[query_vectors.size()]; |
17 |
| - for (unsigned int i = 0; i < query_vectors.size(); i++) { |
18 |
| - distances[i] = new double[base_vectors.size()]; |
19 |
| - baseVectorIndeces[i] = new unsigned int[base_vectors.size()]; |
20 |
| - } |
| 33 | + // Compute the distances between the query vector and all base vectors |
| 34 | + if (query.getQueryType() == 0) { |
| 35 | + for (auto base : base_vectors) { |
| 36 | + distances[query.getIndex()].push_back(euclideanDistance(base, query)); |
| 37 | + } |
| 38 | + } |
21 | 39 |
|
22 |
| - for (unsigned int i = 0; i < query_vectors.size(); i++) { |
23 |
| - for (unsigned int j = 0; j < base_vectors.size(); j++) { |
24 |
| - float distance = euclideanDistance(query_vectors[i], base_vectors[j]); |
25 |
| - distances[i][j] = distance; |
26 |
| - baseVectorIndeces[i][j] = j; |
| 40 | + // If the filter type is C_EQUALS_v then compute the distances between the query vector |
| 41 | + // and the base vectors with the same C value |
| 42 | + else if (query.getQueryType() == 1) { |
| 43 | + for (auto base : base_vectors) { |
| 44 | + if (base.getC() == query.getV()) { |
| 45 | + distances[query.getIndex()].push_back(euclideanDistance(base, query)); |
| 46 | + } |
| 47 | + } |
27 | 48 | }
|
| 49 | + // IMPORTANT: Queries with filter type 2 and 3 are not supported in this version of the application |
28 | 50 | }
|
29 | 51 |
|
30 |
| - // Sort the distances and keep the indeces of the base vectors |
| 52 | + // Sort the distances for each query vector |
31 | 53 | for (unsigned int i = 0; i < query_vectors.size(); i++) {
|
32 |
| - std::sort(baseVectorIndeces[i], baseVectorIndeces[i] + base_vectors.size(), |
33 |
| - [&distances, i](unsigned int a, unsigned int b) { |
34 |
| - return distances[i][a] < distances[i][b]; |
35 |
| - } |
36 |
| - ); |
| 54 | + std::sort(distances[i].begin(), distances[i].end()); |
37 | 55 | }
|
38 | 56 |
|
39 |
| - // Print the indeces of the base vectors with the smallest distances to the query vectors |
40 |
| - for (unsigned int i = 0; i < query_vectors.size(); i++) { |
41 |
| - std::cout << "Query " << i << ":\n"; |
42 |
| - for (unsigned int j = 0; j < 5; j++) { |
43 |
| - std::cout << "Base vector index: " << baseVectorIndeces[i][j] << ", distance: " << distances[i][baseVectorIndeces[i][j]] << std::endl; |
44 |
| - } |
| 57 | + // Return the first `maxDistances` distances for each query vector |
| 58 | + for (auto& query_distances : distances) { |
| 59 | + query_distances.resize(maxDistances); |
45 | 60 | }
|
46 | 61 |
|
47 |
| - // Deallocate memory for the distances |
48 |
| - for (unsigned int i = 0; i < query_vectors.size(); i++) { |
49 |
| - delete[] distances[i]; |
50 |
| - delete[] baseVectorIndeces[i]; |
| 62 | + return distances; |
| 63 | + |
| 64 | +} |
| 65 | + |
/**
 * @brief Save the computed groundtruth distances to a binary file.
 *
 * The values are written as raw memory (host byte order). The file format consists of the following:
 *  - The number of query vectors (4 bytes)
 *  - For each query vector:
 *    - The number of distances (4 bytes)
 *    - The computed distances (4 bytes each)
 *
 * Counts are stored as 32-bit unsigned values. A failure to open or to write the file is
 * reported on std::cerr; the function does not throw and returns nothing.
 *
 * @param distances A 2D vector containing the computed distances for each query vector
 * @param filename The name of the output file to save the distances
 */
void saveGroundtruthToFile(const std::vector<std::vector<float>>& distances, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);

    if (!file.is_open()) {
        std::cerr << "Error opening file: " << filename << std::endl;
        return;
    }

    // Write the number of query vectors to the file
    const uint32_t num_queries = static_cast<uint32_t>(distances.size());
    file.write(reinterpret_cast<const char*>(&num_queries), sizeof(num_queries));

    // Write the distances for each query vector, each row prefixed with its length
    for (const auto& query_distances : distances) {
        const uint32_t num_distances = static_cast<uint32_t>(query_distances.size());
        file.write(reinterpret_cast<const char*>(&num_distances), sizeof(num_distances));
        file.write(reinterpret_cast<const char*>(query_distances.data()),
                   static_cast<std::streamsize>(num_distances) * static_cast<std::streamsize>(sizeof(float)));
    }

    // Flush before checking the stream: buffered data may only hit the disk (and fail) here.
    // The stream itself is closed by its destructor.
    file.flush();
    if (!file) {
        std::cerr << "Error writing file: " << filename << std::endl;
    }
}
| 99 | + |
/**
 * @brief Read the groundtruth distances from a binary file.
 *
 * The values are read as raw memory (host byte order). The file format consists of the following:
 *  - The number of query vectors (4 bytes)
 *  - For each query vector:
 *    - The number of distances (4 bytes)
 *    - The computed distances (4 bytes each)
 *
 * Every count found in the file is checked against the number of bytes that are actually left,
 * so a truncated or corrupt file is rejected instead of triggering a huge allocation or a read
 * of uninitialized values. Any failure is reported on std::cerr.
 *
 * @param filename The name of the input file to read the distances from
 *
 * @return A 2D vector containing the read distances for each query vector, or an empty vector
 *         if the file cannot be opened, is truncated or is otherwise malformed
 */
std::vector<std::vector<float>> readGroundtruthFromFile(const std::string& filename) {
    std::ifstream file(filename, std::ios::binary);

    if (!file.is_open()) {
        std::cerr << "Error opening file: " << filename << std::endl;
        return {};
    }

    // Measure the file so the stored counts can be validated before memory is allocated for them.
    // If the stream cannot report a size (not seekable) the budget stays unlimited and only the
    // per-read checks below apply.
    unsigned long long bytes_left = ~0ULL;
    file.seekg(0, std::ios::end);
    const std::streamoff end_pos = file.tellg();
    if (end_pos >= 0) {
        bytes_left = static_cast<unsigned long long>(end_pos);
    }
    file.clear();
    file.seekg(0, std::ios::beg);
    file.clear();

    // Shared failure path: report and hand back an empty result, like the open failure above
    const auto malformed = [&filename]() {
        std::cerr << "Error reading file: " << filename << std::endl;
        return std::vector<std::vector<float>>{};
    };

    // Read the number of query vectors from the file
    uint32_t num_queries = 0;
    if (bytes_left < sizeof(num_queries) ||
        !file.read(reinterpret_cast<char*>(&num_queries), sizeof(num_queries))) {
        return malformed();
    }
    bytes_left -= sizeof(num_queries);

    // Every query needs at least its own 4-byte length field
    if (num_queries > bytes_left / sizeof(uint32_t)) {
        return malformed();
    }

    std::vector<std::vector<float>> distances(num_queries);

    // Read the distances for each query vector
    for (auto& query_distances : distances) {
        uint32_t num_distances = 0;
        if (bytes_left < sizeof(num_distances) ||
            !file.read(reinterpret_cast<char*>(&num_distances), sizeof(num_distances))) {
            return malformed();
        }
        bytes_left -= sizeof(num_distances);

        const unsigned long long payload = static_cast<unsigned long long>(num_distances) * sizeof(float);
        if (payload > bytes_left) {
            return malformed();
        }

        query_distances.resize(num_distances);
        if (payload > 0 &&
            !file.read(reinterpret_cast<char*>(query_distances.data()), static_cast<std::streamsize>(payload))) {
            return malformed();
        }
        bytes_left -= payload;
    }

    // The stream is closed by its destructor
    return distances;
}
| 138 | + |
| 139 | +int main(int argc, char* argv[]) { |
| 140 | + |
| 141 | + std::vector<BaseDataVector<float>> base_vectors = ReadFilteredBaseVectorFile("data/Dummy/dummy-data.bin"); |
| 142 | + std::vector<QueryDataVector<float>> query_vectors = ReadFilteredQueryVectorFile("data/Dummy/dummy-queries.bin"); |
| 143 | + |
| 144 | + // Compute the distance vector, and save the computed distances to a file |
| 145 | + std::vector<std::vector<float>> distances = computeGroundtruth(base_vectors, query_vectors, 1000); |
| 146 | + saveGroundtruthToFile(distances, "data/Dummy/dummy-groundtruth.bin"); |
| 147 | + |
| 148 | + // Example usage of readGroundtruthFromFile |
| 149 | + std::vector<std::vector<float>> read_distances = readGroundtruthFromFile("data/Dummy/dummy-groundtruth.bin"); |
54 | 150 |
|
55 | 151 | return 0;
|
56 | 152 |
|
|
0 commit comments