Skip to content

Commit 14dc125

Browse files
committed
Refactored and Optimized BWT
1 parent f1f88d2 commit 14dc125

File tree

1 file changed

+32
-41
lines changed

1 file changed

+32
-41
lines changed

Compressors/BWT/BWT.h

Lines changed: 32 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
#include "../../Utils/BinaryIO.h"
55
#include "SuffixArray.h"
66
#include <algorithm>
7-
#include <list>
87
#include <cstdint>
8+
#include <deque>
9+
#include <bit_string.h>
910

1011
/**
1112
* Burrows - Wheeler Transform
@@ -19,30 +20,24 @@ class BWT {
1920
/**
 * Applies the Burrows-Wheeler transform to a file and stores the result.
 *
 * Steps, in order: read the input through BinaryIO::readString, append a
 * '\0' sentinel, build the suffix array of the sentinel-terminated text,
 * derive the transformed text with generateBWT, delete any existing output
 * file, then write originalIndex followed by the transformed text.
 *
 * NOTE(review): originalIndex is not assigned anywhere in this function;
 * presumably generateBWT sets it as a side effect - confirm.
 *
 * @param filename       path of the file to transform
 * @param outputFileName path of the file that receives the result
 */
static void encode(const std::string& filename, const std::string& outputFileName) {
2021

2122
std::string toBeEncoded = BinaryIO::readString(filename);
22-
23-
toBeEncoded += '\0';
23+
toBeEncoded += '\0'; // Last char must be the smallest of all
2424

2525
std::vector<uint32_t> suffixArray = SuffixArray::buildSuffixArray(toBeEncoded);
2626

2727
std::string bwt = generateBWT(toBeEncoded, suffixArray);
2828

2929
remove(outputFileName.c_str()); // Remove Output File If Exists
// Output layout: the index goes first, then the transformed text.
// NOTE(review): two consecutive writes to the same path only produce that
// layout if BinaryIO::write appends - confirm.
30+
BinaryIO::write(outputFileName, bit_string::from_uint_32(originalIndex));
3031
BinaryIO::write(outputFileName, bwt);
31-
BinaryIO::write(outputFileName, originalIndex);
3232

3333
}
3434

3535
static void decode(const std::string& filename, const std::string& outputFileName) {
3636

37-
std::string bwt = BinaryIO::readString(filename);
38-
39-
std::string index;
40-
for (int i = 0; i < sizeof(originalIndex); ++i) {
41-
index.insert(0, 1, bwt.back());
42-
bwt.pop_back();
43-
}
37+
// Original Index is the first 4 bytes of the file
38+
originalIndex = BinaryIO::readBitString(filename, 0, sizeof(originalIndex)).to_uint_32();
39+
std::string bwt = BinaryIO::readString(filename, sizeof(originalIndex));
4440

45-
originalIndex = (uint32_t) Converter::string_ToInt64(index);
4641
std::string inverseBWT = BWT::invertBWT(bwt, originalIndex);
4742
inverseBWT.pop_back(); // remove '\0' which was added at encoding
4843

@@ -53,9 +48,28 @@ class BWT {
5348

5449

5550
private:
56-
static void computeLeftShift(std::list<int>& list, int index, int leftShift[]) {
57-
leftShift[index] = list.front();
58-
list.pop_front();
51+
52+
/**
 * Builds the left-shift table used to invert a Burrows-Wheeler transform.
 *
 * Entry i holds the position in `bwt` of the byte that lands in slot i when
 * the bytes of `bwt` are ordered by ascending unsigned value, with equal
 * bytes kept in their original relative order.
 *
 * The table is produced with a single counting pass instead of sorting a
 * copy of the text and draining one queue per symbol; both approaches yield
 * the same table as long as the sort orders bytes by unsigned value.
 * NOTE(review): this assumes unsignedCharsCompare (used by the previous
 * implementation) is an ascending unsigned-byte comparison - confirm.
 *
 * @param bwt transformed text; may contain any byte value, including '\0'
 * @return table with bwt.length() entries (empty when `bwt` is empty)
 */
static std::vector<uint32_t> computeLeftShift(const std::string& bwt) {
    constexpr int kSymbolCount = 256; // one bucket per possible byte value

    // After the prefix sum, bucketStart[s] is the number of bytes in bwt that
    // are strictly smaller than s, i.e. the first sorted slot owned by s.
    uint32_t bucketStart[kSymbolCount + 1] = {0};
    for (const char symbol : bwt) {
        ++bucketStart[static_cast<uint8_t>(symbol) + 1];
    }
    for (int s = 1; s <= kSymbolCount; ++s) {
        bucketStart[s] += bucketStart[s - 1];
    }

    std::vector<uint32_t> leftShift(bwt.length());
    // Scanning left to right hands each symbol's slots out in increasing
    // position order, which keeps equal bytes stable.
    for (std::string::size_type i = 0; i < bwt.length(); ++i) {
        const uint8_t symbol = static_cast<uint8_t>(bwt[i]);
        leftShift[bucketStart[symbol]++] = static_cast<uint32_t>(i);
    }

    return leftShift;
}
6074

6175
// Generate Burrows - Wheeler Transform of given text
@@ -87,41 +101,18 @@ class BWT {
87101
}
88102

89103
/**
 * Reconstructs a text from its Burrows-Wheeler transform.
 *
 * Obtains the left-shift table for `BWT` from computeLeftShift, then, starting
 * at `index`, follows the table for exactly BWT.length() steps and appends the
 * byte found at each visited position.
 *
 * @param BWT   transformed text
 * @param index starting position; it is used directly as a subscript into the
 *              left-shift table and into `BWT`.
 *              NOTE(review): no range check is visible in this function - a
 *              value outside [0, BWT.length()) would index out of bounds;
 *              confirm that callers validate it.
 * @return the reconstructed text, BWT.length() bytes long
 */
static std::string invertBWT(const std::string& BWT, long long index) {
90-
int length = BWT.length();
91-
std::string sortedBWT = BWT;
92-
int* leftShift = new int[length];
93104

94-
// Sorts the characters of BWT[] alphabetically
95-
std::sort(sortedBWT.begin(), sortedBWT.end(), unsignedCharsCompare);
96-
97-
// Array of pointers that act as head nodes
98-
// to linked lists created to compute leftShift[]
99-
std::list<int> arr[256]; // 256 is the length all symbols as we us byte it is 256 maximum
100-
101-
// Takes each distinct character of BWT[] as head
102-
// of a linked list and appends to it the new node
103-
// whose data part contains index at which
104-
// character occurs in BWT[]
105-
for (int i = 0; i < length; i++) {
106-
arr[(uint8_t) BWT[i]].push_back(i);
107-
}
108-
109-
// Takes each distinct character of sorted_arr[] as head
110-
// of a linked list and finds leftShift[]
111-
for (int i = 0; i < length; i++) {
112-
computeLeftShift(arr[(uint8_t) sortedBWT[i]], i, leftShift);
113-
}
105+
std::vector<uint32_t> leftShift = computeLeftShift(BWT);
114106

115107
std::string inverseBWT;
108+
inverseBWT.reserve(BWT.length());
116109

117110
// Walk the left-shift chain: each step moves `index` to the next position
// and emits the byte stored there.
// NOTE(review): the loop counter is a signed int compared against the
// unsigned BWT.length().
118-
for (int i = 0; i < length; i++) {
111+
for (int i = 0; i < BWT.length(); i++) {
119112
index = leftShift[index];
120113
inverseBWT += BWT[index];
121114
}
122115

123-
delete[] leftShift;
124-
125116
return inverseBWT;
126117
}
127118

0 commit comments

Comments
 (0)