Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 54 additions & 47 deletions cpp/src/mlt/decode/string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ class StringDecoder {
throw std::runtime_error("Data stream missing logical type");
}
dictType = streamMetadata->getLogicalStreamType()->getDictionaryType();
/// Can only two DictionaryType values reach this point?
auto& target = (dictType == DictionaryType::SINGLE) ? dictionaryStream : symbolStream;
/// Is DictionaryType::FSST not used here?
decodeRaw(tileData, target, streamMetadata->getByteLength(), /*consume=*/true);
break;
}
Expand Down Expand Up @@ -209,53 +211,6 @@ class StringDecoder {
decompressedLength);
}

/// Decodes an FSST-compressed byte stream.
///
/// The symbol table is tightly packed: `symbols` holds the bytes of all symbols
/// back to back, and `symbolLengths[k]` is the byte length of symbol `k`. The
/// start of symbol `k` is therefore the sum of the lengths that precede it.
///
/// @param symbols             Packed symbol bytes.
/// @param symbolCount         Number of bytes in `symbols`.
/// @param symbolLengths       Byte length of each symbol.
/// @param symbolLengthCount   Number of entries in `symbolLengths`.
/// @param compressedData      Input codes; code 255 escapes the literal byte that follows it.
/// @param compressedDataCount Number of bytes in `compressedData`.
/// @param decompressedLength  Initial size of the output buffer; it grows when too small.
/// @return The decoded bytes, trimmed to what was actually written.
/// @throws std::runtime_error if a code has no entry in `symbolLengths`, if a symbol
///         does not lie inside `symbols`, or if the input ends right after an escape code.
static std::vector<std::uint8_t> decodeFSST(const std::uint8_t* symbols,
                                            const std::size_t symbolCount,
                                            const std::uint32_t* symbolLengths,
                                            const std::size_t symbolLengthCount,
                                            const std::uint8_t* compressedData,
                                            const std::size_t compressedDataCount,
                                            const std::size_t decompressedLength) {
    constexpr std::uint8_t escapeCode = 255;

    std::vector<std::uint8_t> output(decompressedLength);

    // Prefix sums of the lengths; std::size_t so that a long table cannot wrap a 32-bit sum.
    std::vector<std::size_t> symbolOffsets(symbolLengthCount);
    for (std::size_t i = 1; i < symbolLengthCount; i++) {
        symbolOffsets[i] = symbolOffsets[i - 1] + symbolLengths[i - 1];
    }

    std::size_t idx = 0;
    for (std::size_t i = 0; i < compressedDataCount; i++) {
        const std::uint8_t symbolIndex = compressedData[i];

        if (symbolIndex == escapeCode) {
            // The escape code must be followed by the literal byte it escapes.
            if (++i >= compressedDataCount) {
                throw std::runtime_error("FSST decode: truncated escape sequence");
            }
            if (idx == output.size()) {
                // Doubling an empty buffer would leave it empty, so start at one byte.
                output.resize(output.empty() ? 1 : output.size() * 2);
            }
            output[idx++] = compressedData[i];
        } else if (symbolIndex < symbolLengthCount) {
            const std::size_t len = symbolLengths[symbolIndex];
            const std::size_t offset = symbolOffsets[symbolIndex];
            // Reject symbols that start or end outside the packed table before copying.
            if (offset >= symbolCount || len > symbolCount - offset) {
                throw std::runtime_error("FSST decode: symbol index out of bounds");
            }
            if (len == 0) {
                continue; // nothing to copy; also avoids indexing an empty output buffer
            }
            if (idx + len > output.size()) {
                output.resize((output.size() + len) * 2);
            }
            std::memcpy(&output[idx], &symbols[offset], len);
            idx += len;
        } else {
            throw std::runtime_error("FSST decode: invalid symbol index");
        }
    }

    // Drop the unused tail of the (possibly over-allocated) buffer.
    output.resize(idx);
    return output;
}

private:
IntegerDecoder& intDecoder;

Expand Down Expand Up @@ -317,6 +272,58 @@ class StringDecoder {
out.push_back(dictionary[offsets[offsetIndex++]]);
}
}

/// Decodes an FSST-compressed byte stream.
///
/// The symbol table is tightly packed: `symbols` holds the bytes of all symbols
/// back to back, and `symbolLengths[k]` is the byte length of symbol `k`. The
/// start of symbol `k` is therefore the sum of the lengths that precede it.
///
/// @param symbols             Packed symbol bytes.
/// @param symbolCount         Number of bytes in `symbols`.
/// @param symbolLengths       Byte length of each symbol.
/// @param symbolLengthCount   Number of entries in `symbolLengths`.
/// @param compressedData      Input codes; code 255 escapes the literal byte that follows it.
/// @param compressedDataCount Number of bytes in `compressedData`.
/// @param decompressedLength  Capacity reserved for the output up front (a hint only).
/// @return The decoded bytes.
/// @throws std::runtime_error if a code has no entry in `symbolLengths`, if a symbol
///         does not lie inside `symbols`, if the input ends right after an escape code,
///         or if an escape code is directly followed by another byte of value 255.
static std::vector<std::uint8_t> decodeFSST(const std::uint8_t* symbols,
                                            const std::size_t symbolCount,
                                            const std::uint32_t* symbolLengths,
                                            const std::size_t symbolLengthCount,
                                            const std::uint8_t* compressedData,
                                            const std::size_t compressedDataCount,
                                            const std::size_t decompressedLength) {
    constexpr std::uint8_t escapeCode = 255;

    std::vector<std::uint8_t> output;
    output.reserve(decompressedLength);

    // Prefix sums of the lengths. The vector must be sized before it is indexed;
    // std::size_t keeps a long table from wrapping a 32-bit sum.
    std::vector<std::size_t> symbolOffsets(symbolLengthCount);
    for (std::size_t i = 1; i < symbolLengthCount; i++) {
        symbolOffsets[i] = symbolOffsets[i - 1] + symbolLengths[i - 1];
    }

    for (std::size_t i = 0; i < compressedDataCount; i++) {
        const std::uint8_t symbolIndex = compressedData[i];
        if (symbolIndex == escapeCode) {
            /// Escaped bytes are stored uncompressed and are copied through as-is.
            // The literal must exist before it is inspected or copied.
            if (i + 1 >= compressedDataCount) {
                throw std::runtime_error("FSST decode: truncated escape sequence");
            }
            // NOTE(review): this also rejects an escaped literal byte of value 255;
            // confirm that the encoder never needs to emit one.
            if (compressedData[i + 1] == escapeCode) {
                throw std::runtime_error("FSST decode: two escape sequences in a row detected index");
            }
            output.push_back(compressedData[++i]);
        } else if (symbolIndex < symbolLengthCount) {
            const std::size_t len = symbolLengths[symbolIndex];
            const std::size_t offset = symbolOffsets[symbolIndex];
            // Reject symbols that start or end outside the packed table before copying.
            if (offset >= symbolCount || len > symbolCount - offset) {
                throw std::runtime_error("FSST decode: symbol index out of bounds");
            }
            const std::uint8_t* start = symbols + offset;
            const std::uint8_t* end = start + len;
            output.insert(output.end(), start, end);
        } else {
            throw std::runtime_error("FSST decode: invalid symbol index");
        }
    }
    return output;
    /* NOTE: the code below would provide a faster lookup (author's opinion). It is the "easy"
       example from the FSST paper. It is currently not possible, because the symbols are already
       tightly packed inside the byte stream for FSST encoding; that trade-off was made to get a
       tighter packing of the symbol table.

       That variant decodes up to 8 bytes of string value per code via:
       void decodeSingleByteviaFSST(uint8_t in[], uint8_t out[],
                                    uint64_t sym[256], uint8_t len[256]) {
           uint8_t code = *in++;
           *((uint64_t*)out) = sym[code];
           out += len[code];
       }
    */
}
};

} // namespace mlt::decoder
57 changes: 56 additions & 1 deletion cpp/test/test_fsst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <string>
#include <vector>

TEST(FSST, DecodeFromJava) {
TEST(FSST, DecodeFromJava_decode1) {
const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCC";
const std::vector<std::uint8_t> symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
const std::vector<std::uint32_t> symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
Expand All @@ -17,14 +17,69 @@ TEST(FSST, DecodeFromJava) {

EXPECT_EQ(decoded.size(), expected.size());
EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
}

// Decodes the Java-produced sample while passing 0 as the last argument of
// decodeFSST, so that the decoder's output-buffer growth is exercised.
TEST(FSST, DecodeFromJava_decode2) {
    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCC";
    const std::vector<std::uint8_t> symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
    const std::vector<std::uint32_t> symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
    const std::vector<std::uint8_t> javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1,
                                                      1, 1, 8, 8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5};

    // make sure buffer growth works
    const auto decoded2 = mlt::decoder::StringDecoder::decodeFSST(symbols, symbolLengths, javaCompressed, 0);
    // Fatal check: the memcmp below reads expected.size() bytes from decoded2, so it
    // must not run when the decoded buffer has a different size.
    ASSERT_EQ(decoded2.size(), expected.size());
    EXPECT_EQ(0, memcmp(expected.c_str(), decoded2.data(), expected.size()));
}

// Decodes the Java-produced sample while passing only half of the expected
// length as the last argument of decodeFSST.
TEST(FSST, DecodeFromJava_decode3) {
    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCC";
    const std::vector<std::uint8_t> symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
    const std::vector<std::uint32_t> symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
    const std::vector<std::uint8_t> javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1,
                                                      1, 1, 8, 8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5};

    const auto decoded3 = mlt::decoder::StringDecoder::decodeFSST(
        symbols, symbolLengths, javaCompressed, expected.size() / 2);
    // Fatal check: the memcmp below reads expected.size() bytes from decoded3, so it
    // must not run when the decoded buffer has a different size.
    ASSERT_EQ(decoded3.size(), expected.size());
    // Compare the whole result, not just the first half: the second half is the
    // part that lies beyond the length passed to the decoder.
    EXPECT_EQ(0, memcmp(expected.c_str(), decoded3.data(), expected.size()));
}

// Decodes the Java-produced sample followed by one escaped literal
// (code 255, then byte 107 = 'k').
TEST(FSST, DecodeFromJava_With_one_Escape_character) {
    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCCk";
    const std::vector<std::uint8_t> symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
    // NOTE(review): ten lengths summing to 13 for a 12-byte symbol table; the last
    // entry is never referenced by javaCompressed. Confirm whether it is intended.
    const std::vector<std::uint32_t> symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1, 1};
    const std::vector<std::uint8_t> javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1, 1,
                                                      1, 8, 8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5, 255, 107};

    const auto decoded = mlt::decoder::StringDecoder::decodeFSST(
        symbols, symbolLengths, javaCompressed, expected.size());
    // Fatal check: the memcmp below reads expected.size() bytes from decoded, so it
    // must not run when the decoded buffer has a different size.
    ASSERT_EQ(decoded.size(), expected.size());
    EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
}

// Decodes the Java-produced sample followed by three escaped literals
// (each one code 255, then byte 107 = 'k').
TEST(FSST, DecodeFromJava_With_multiple_Escape_characters) {
    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCCkkk";
    const std::vector<std::uint8_t> symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
    // NOTE(review): ten lengths summing to 13 for a 12-byte symbol table; the last
    // entry is never referenced by javaCompressed. Confirm whether it is intended.
    const std::vector<std::uint32_t> symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1, 1};
    const std::vector<std::uint8_t> javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1, 1, 1, 8,
                                                      8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5, 255, 107, 255, 107, 255, 107};

    const auto decoded = mlt::decoder::StringDecoder::decodeFSST(
        symbols, symbolLengths, javaCompressed, expected.size());
    // Fatal check: the memcmp below reads expected.size() bytes from decoded, so it
    // must not run when the decoded buffer has a different size.
    ASSERT_EQ(decoded.size(), expected.size());
    EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
}

// Decodes an input that consists of nothing but one escaped literal
// (code 255, then byte 107 = 'k').
TEST(FSST, DecodeFromJava_With_one_single_escaped_character) {
    const std::string expected = "k";
    const std::vector<std::uint8_t> symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
    const std::vector<std::uint32_t> symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
    const std::vector<std::uint8_t> javaCompressed = {255, 107};

    const auto decoded = mlt::decoder::StringDecoder::decodeFSST(
        symbols, symbolLengths, javaCompressed, expected.size());
    // Fatal check: the memcmp below reads expected.size() bytes from decoded, so it
    // must not run when the decoded buffer has a different size.
    ASSERT_EQ(decoded.size(), expected.size());
    EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
}
4 changes: 2 additions & 2 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,13 @@ mkdocs-build:
cd mkdocs && docker run --rm -v ${PWD}:/docs squidfunk/mkdocs-material build --strict

# Build Java encoder and generate .mlt files for all .pbf files in test/fixtures
[working-directory: 'java']
#[working-directory: 'java']
generate-expected-mlt: (cargo-install 'fd' 'fd-find')
./gradlew cli
fd . ../test/fixtures --no-ignore --extension pbf --extension mvt -x {{quote(just_executable())}} generate-one-expected-mlt

# Generate a single .mlt file for a given .mvt or .pbf file, assuming JAR is built
[working-directory: 'java']
#[working-directory: 'java']
[private]
generate-one-expected-mlt file:
java \
Expand Down