Skip to content

Commit f1f88d2

Browse files
committed
Optimized Huffman with help of Bit String
1 parent fea813c commit f1f88d2

File tree

1 file changed

+94
-82
lines changed

1 file changed

+94
-82
lines changed

Compressors/Huffman/Huffman.h

Lines changed: 94 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,32 @@
77
#include <cstdint>
88

99
#include "Node.h"
10-
#include "../../Utils/Converter.h"
1110
#include "../../Utils/BinaryIO.h"
12-
13-
11+
#include <bit_string.h>
12+
13+
14+
/**
15+
* David Huffman Encoding and Decoding
16+
*
17+
* @File_Format
18+
* ___________________________________________________ _
19+
* | File_Header_Size_In_Bits (2 Bytes) | \
20+
* |_________________________________________________| \
21+
* | File Header (Dictionary): | \
22+
* | Each Item consists of: | Length of This Part in bits is File_Header_Size_In_Bits
23+
* | - Original Value (1 Byte) [ex. 'A' -> 0x41] | There may be some extra bits if size does not fit in bytes
24+
* | - Huffman_Code_Length (1 Byte) | /
25+
* | - Huffman Code (Huffman_Code_Length Bits) | /
26+
* | | /
27+
* |_________________________________________________| -
28+
* | |
29+
* | Huffman Coded Data (Rest of the file) |
30+
* |_________________________________________________|
31+
* | Extra Bits in Last Byte (1 Byte) |
32+
* | There may be some extra bits, if size of Data |
33+
* | does not fit in bytes |
34+
* ---------------------------------------------------
35+
*/
1436
class Huffman {
1537

1638
static const uint32_t BYTE = 8;
@@ -28,32 +50,41 @@ class Huffman {
2850
deallocateHuffmanTree(huffmanTree); // release memory
2951
frequencies.clear(); // release memory
3052

31-
std::string binaryFileHeader = Converter::bitString_ToRealBinary(generateFileHeader(huffmanCodes));
53+
bit_string binaryFileHeader = generateFileHeader(huffmanCodes);
3254

3355
remove(outputFileName.c_str()); // Remove Output File If Exists
3456
BinaryIO::write(outputFileName, binaryFileHeader);
3557

3658
binaryFileHeader.clear();
3759
binaryFileHeader.shrink_to_fit(); // release memory
3860

39-
std::string encodedData = encode(toBeEncoded, huffmanCodes);
61+
bit_string encodedData = encode(toBeEncoded, huffmanCodes);
4062

4163
BinaryIO::write(outputFileName, encodedData);
4264
}
4365

4466

4567
static void decode(const std::string& filename, const std::string& outputFileName) {
4668

47-
uint32_t fileHeaderSizeInBytes = byteSize(decodeFileHeaderSizeInBits(BinaryIO::readString(filename, 0, 2)));
69+
// The file header size (in bits) is stored in the first 2 bytes of the file
70+
uint32_t fileHeaderSizeInBits = BinaryIO::readBitString(filename, 0, 2).to_uint_32();
71+
72+
// The remainder of the file header is the Huffman dictionary
73+
bit_string dictionary = BinaryIO::readBitString(filename, 2, byteSize(fileHeaderSizeInBits));
74+
75+
uint32_t dictionarySizeInBits = fileHeaderSizeInBits - 2 * BYTE;
4876

49-
std::string fileHeader = BinaryIO::readString(filename, 0, fileHeaderSizeInBytes);
77+
// Remove Extra bits in last byte
78+
dictionary.pop_back(dictionary.size() - dictionarySizeInBits);
5079

51-
auto huffmanCodes = reconstructHuffmanCodes(fileHeader);
80+
auto huffmanCodes = reconstructHuffmanCodes(dictionary);
81+
dictionary.clear();
82+
dictionary.shrink_to_fit(); // release memory
5283
Node* huffmanTree = reconstructHuffmanTree(huffmanCodes);
5384

54-
std::string toBeDecoded = BinaryIO::readString(filename, fileHeaderSizeInBytes);
85+
bit_string toBeDecoded = BinaryIO::readBitString(filename, byteSize(fileHeaderSizeInBits));
5586

56-
std::string decodedData = decode(toBeDecoded, huffmanCodes, huffmanTree);
87+
bit_string decodedData = decode(toBeDecoded, huffmanCodes, huffmanTree);
5788

5889
remove(outputFileName.c_str()); // Remove Output File If Exists
5990
BinaryIO::write(outputFileName, decodedData);
@@ -69,37 +100,39 @@ class Huffman {
69100
}
70101

71102

72-
static std::string encode(const std::string& toBeEncoded, std::unordered_map<char, std::string>& huffmanCodes) {
73-
std::string encodedString;
103+
static bit_string encode(const std::string& toBeEncoded, std::unordered_map<uint8_t, bit_string>& huffmanCodes) {
104+
bit_string encodedData;
105+
encodedData.reserve(toBeEncoded.size() / (4 * BYTE));
74106
for (char c : toBeEncoded) {
75-
encodedString += huffmanCodes[c];
107+
encodedData += huffmanCodes[c];
76108
}
77109

78-
std::string encodedData = Converter::bitString_ToRealBinary(encodedString);
79-
uint8_t extraBitsInLastByte = getNumberOfExtraBitsInLastByte(encodedString);
110+
uint8_t extraBitsInLastByte = encodedData.extra_bits_size();
111+
for (int i = 0; i < extraBitsInLastByte; ++i) {
112+
encodedData.push_back(0);
113+
}
80114

81115
encodedData += extraBitsInLastByte;
82-
83116
return encodedData;
84117
}
85118

86119

87-
static std::string decode(std::string& toBeDecoded, std::unordered_map<std::string, char>& huffmanCodes, Node* huffmanTree) {
120+
static bit_string decode(bit_string& toBeDecoded, std::unordered_map<bit_string, uint8_t>& huffmanCodes, Node* huffmanTree) {
88121

89-
int extraBitsInLastByte = toBeDecoded.back();
90-
toBeDecoded.pop_back();
122+
uint8_t extraBitsInLastByte = toBeDecoded.back_byte();
123+
toBeDecoded.pop_back(BYTE);
124+
toBeDecoded.pop_back(extraBitsInLastByte);
91125

92-
std::string toBeDecodedBitString = Converter::string_ToBitString(toBeDecoded);
126+
bit_string decodedData;
127+
decodedData.reserve(toBeDecoded.size());
93128

94-
toBeDecodedBitString.erase(toBeDecodedBitString.length() - extraBitsInLastByte);
95-
96-
std::string decodedData, currentCode;
129+
bit_string currentCode;
97130
Node* currentNode = huffmanTree;
98-
for (char toBeDecodedBit : toBeDecodedBitString) {
131+
for (bool toBeDecodedBit : toBeDecoded) {
99132

100-
if (toBeDecodedBit == '1')
133+
if (toBeDecodedBit == 1)
101134
currentNode = currentNode->left;
102-
else if (toBeDecodedBit == '0')
135+
else if (toBeDecodedBit == 0)
103136
currentNode = currentNode->right;
104137

105138
currentCode += toBeDecodedBit;
@@ -113,17 +146,10 @@ class Huffman {
113146
return decodedData;
114147
}
115148

116-
static uint32_t decodeFileHeaderSizeInBits(const std::string& fileHeader) {
117-
uint32_t fileHeaderSizeInBits = (uint8_t) fileHeader[0];
118-
fileHeaderSizeInBits <<= BYTE;
119-
fileHeaderSizeInBits += (uint8_t) fileHeader[1];
120-
return fileHeaderSizeInBits;
121-
}
122149

123-
124-
static std::unordered_map<char , std::string> generateHuffmanCodes(Node* huffmanTree) {
125-
std::unordered_map<char, std::string> huffmanCodes;
126-
std::string code;
150+
static std::unordered_map<uint8_t, bit_string> generateHuffmanCodes(Node* huffmanTree) {
151+
std::unordered_map<uint8_t, bit_string> huffmanCodes;
152+
bit_string code;
127153

128154
std::function<void(Node*)> generateHuffmanCodesRecursive = [&](Node* node) {
129155
if (node == nullptr)
@@ -134,9 +160,9 @@ class Huffman {
134160
return;
135161
}
136162

137-
code.push_back('1');
163+
code.push_back(1);
138164
generateHuffmanCodesRecursive(node->left);
139-
code.back() = '0';
165+
code.back() = 0;
140166
generateHuffmanCodesRecursive(node->right);
141167
code.pop_back();
142168
};
@@ -147,35 +173,26 @@ class Huffman {
147173
}
148174

149175

150-
// Read the file Header and construct the Huffman Codes Dictionary
151-
static std::unordered_map<std::string, char> reconstructHuffmanCodes(const std::string& fileHeader) {
152-
153-
uint32_t dictionaryLengthInBits = decodeFileHeaderSizeInBits(fileHeader) - (2u * BYTE);
176+
// Read the File Header and construct the Huffman Codes Dictionary
177+
static std::unordered_map<bit_string, uint8_t> reconstructHuffmanCodes(const bit_string& dictionary) {
154178

155-
std::string dictionaryBits = Converter::string_ToBitString(fileHeader, 2, fileHeader.length());
179+
std::unordered_map<bit_string, uint8_t> huffmanCodes(dictionary.size()/4);
156180

157-
// if the header does not fit in bytes, i.e. there are extra bits which does not belong to it
158-
while (dictionaryBits.length() > dictionaryLengthInBits)
159-
dictionaryBits.pop_back();
160-
161-
std::unordered_map<std::string, char> huffmanCodes;
162-
uint8_t value;
163-
std::string code;
164-
for (int i = 0; i < dictionaryLengthInBits;) {
165-
value = Converter::bitString_ToRealBinary(dictionaryBits, i, BYTE)[0];
181+
for (int i = 0; i < dictionary.size();) {
182+
uint8_t value = dictionary.substr(i, BYTE).to_uint_8();
166183
i += BYTE;
167-
uint32_t codeLength = Converter::bitString_ToInt(dictionaryBits.substr(i, BYTE));
184+
uint32_t codeLength = dictionary.substr(i, BYTE).to_uint_32();
168185
i += BYTE;
169-
code = dictionaryBits.substr(i, codeLength);
186+
bit_string code = dictionary.substr(i, codeLength);
170187
i += codeLength;
171-
huffmanCodes[code] = value;
188+
huffmanCodes[std::move(code)] = value;
172189
}
173190

174191
return huffmanCodes;
175192
}
176193

177194

178-
static Node* buildHuffmanTree(const std::unordered_map<char, int>& frequencies) {
195+
static Node* buildHuffmanTree(const std::unordered_map<uint8_t, uint32_t>& frequencies) {
179196
std::priority_queue<Node*, std::vector<Node*>, Node::Compare> pq;
180197

181198
// for readability purposes
@@ -202,7 +219,7 @@ class Huffman {
202219
}
203220

204221

205-
static Node* reconstructHuffmanTree(const std::unordered_map<std::string, char> &huffmanCodes) {
222+
static Node* reconstructHuffmanTree(const std::unordered_map<bit_string, uint8_t> &huffmanCodes) {
206223

207224
Node* root = new Node;
208225
Node* currentNode;
@@ -213,13 +230,13 @@ class Huffman {
213230

214231
for (const auto& huffmanCode : huffmanCodes) {
215232
currentNode = root;
216-
for (char currentCodeBit : huffmanCode.code) {
217-
if (currentCodeBit == '1') { // goes left
233+
for (bool currentCodeBit : huffmanCode.code) {
234+
if (currentCodeBit == 1) { // goes left
218235
if (currentNode->left == nullptr) {
219236
currentNode->left = new Node;
220237
}
221238
currentNode = currentNode->left;
222-
} else if (currentCodeBit == '0') { // goes right
239+
} else if (currentCodeBit == 0) { // goes right
223240
if (currentNode->right == nullptr) {
224241
currentNode->right = new Node;
225242
}
@@ -256,25 +273,26 @@ class Huffman {
256273
}
257274

258275

259-
static std::unordered_map<char, int> generateFrequencies(const std::string& input) {
260-
std::unordered_map<char, int> frequencies;
261-
for (char currentChar : input) {
276+
static std::unordered_map<uint8_t, uint32_t> generateFrequencies(const std::string& input) {
277+
std::unordered_map<uint8_t, uint32_t> frequencies(256);
278+
for (uint8_t currentChar : input) {
262279
frequencies[currentChar]++;
263280
}
264281
return frequencies;
265282
}
266283

267284

268-
static std::string generateFileHeaderDictionary(const std::unordered_map<char, std::string>& huffmanCodes) {
285+
static bit_string generateFileHeaderDictionary(const std::unordered_map<uint8_t , bit_string>& huffmanCodes) {
269286
#define encodedValue first
270287
#define code second
271288

272-
std::string dictionary;
289+
bit_string dictionary;
290+
dictionary.reserve(huffmanCodes.size());
273291

274292
for (auto& huffmanCode : huffmanCodes) {
275-
dictionary += Converter::int8_ToBitString(huffmanCode.encodedValue);
293+
dictionary += huffmanCode.encodedValue;
276294
uint8_t currentCodeLength = huffmanCode.code.length();
277-
dictionary += Converter::int8_ToBitString(currentCodeLength);
295+
dictionary += currentCodeLength;
278296
dictionary += huffmanCode.code;
279297
}
280298

@@ -285,27 +303,21 @@ class Huffman {
285303
}
286304

287305

288-
static std::string generateFileHeader(const std::unordered_map<char, std::string>& huffmanCodes) {
289-
290-
std::string dictionary = generateFileHeaderDictionary(huffmanCodes);
306+
static bit_string generateFileHeader(const std::unordered_map<uint8_t , bit_string>& huffmanCodes) {
307+
// TODO: More Optimization
308+
bit_string dictionary = generateFileHeaderDictionary(huffmanCodes);
291309

292-
// 2u * BYTE is the size of "fileHeaderSize" itself as it is added to the header
293-
uint16_t fileHeaderSize = 2u * BYTE + dictionary.length();
310+
// Size of the "fileHeaderSizeInBits" field itself, since it is counted as part of the header
311+
uint16_t sizeOfFileHeaderSize = 2u * BYTE;
312+
uint16_t fileHeaderSizeInBits = sizeOfFileHeaderSize + dictionary.length();
294313

295-
// File Header Size in bits (it is 2 Bytes -> max of 8KB header)
296-
std::string fileHeaderSizeInBits = Converter::int16_ToBitString(fileHeaderSize);
297-
298-
std::string fileHeader = fileHeaderSizeInBits + dictionary;
314+
bit_string fileHeader = bit_string::from_uint_16(fileHeaderSizeInBits);
315+
fileHeader += dictionary;
299316
return fileHeader;
300317
}
301318

302-
static uint8_t getNumberOfExtraBitsInLastByte(const std::string& encodedString) {
303-
int maxLengthFitInBytes = encodedString.length() & (~(BYTE - 1u));
304-
return (BYTE - (encodedString.length() - maxLengthFitInBytes));
305-
}
306-
307319

308320
};
309321

310322

311-
#endif //HUFFMAN_H
323+
#endif //HUFFMAN_H

0 commit comments

Comments
 (0)