diff --git a/src/source_language_protocol.cpp b/src/source_language_protocol.cpp index f76a15a..ccdae62 100644 --- a/src/source_language_protocol.cpp +++ b/src/source_language_protocol.cpp @@ -254,6 +254,8 @@ LanguageProtocol::Capabilities LanguageProtocol::Client::initialize(Source::Lang capabilities.type_coverage = capabilities_pt->get("typeCoverageProvider", false); } + capabilities.use_line_index = result.get("offsetEncoding", "") == "utf-8"; + write_notification("initialized", ""); } result_processed.set_value(); @@ -709,13 +711,11 @@ void Source::LanguageProtocolView::setup_navigation_and_refactoring() { }); result_processed.get_future().get(); - auto embolden_token = [](std::string &line, int token_start_pos, int token_end_pos) { - Glib::ustring uline = std::move(line); - - //markup token as bold + auto embolden_token = [this](std::string &line, int token_start_pos, int token_end_pos) { + // Markup token as bold size_t pos = 0; - while((pos = uline.find('&', pos)) != Glib::ustring::npos) { - size_t pos2 = uline.find(';', pos + 2); + while((pos = line.find('&', pos)) != std::string::npos) { + size_t pos2 = line.find(';', pos + 2); if(static_cast(token_start_pos) > pos) { token_start_pos += pos2 - pos; token_end_pos += pos2 - pos; @@ -726,22 +726,27 @@ void Source::LanguageProtocolView::setup_navigation_and_refactoring() { break; pos = pos2 + 1; } - if(static_cast(token_start_pos) > uline.size()) - token_start_pos = uline.size(); - if(static_cast(token_end_pos) > uline.size()) - token_end_pos = uline.size(); - if(token_start_pos != token_end_pos) { - uline.insert(token_end_pos, ""); - uline.insert(token_start_pos, ""); + + if(!capabilities.use_line_index) { + auto code_units_diff = token_end_pos - token_start_pos; + token_start_pos = utf16_code_units_byte_count(line, token_start_pos); + token_end_pos = token_start_pos + utf16_code_units_byte_count(line, code_units_diff, token_start_pos); + } + + if(static_cast(token_start_pos) > line.size()) + token_start_pos = line.size(); + if(static_cast(token_end_pos) > line.size()) + token_end_pos = line.size(); + if(token_start_pos < token_end_pos) { + line.insert(token_end_pos, ""); + line.insert(token_start_pos, ""); } size_t start_pos = 0; - while(start_pos < uline.size() && (uline[start_pos] == ' ' || uline[start_pos] == '\t')) + while(start_pos < line.size() && (line[start_pos] == ' ' || line[start_pos] == '\t')) ++start_pos; if(start_pos > 0) - uline.erase(0, start_pos); - - line = std::move(uline); + line.erase(0, start_pos); }; std::unordered_map> file_lines; @@ -1278,6 +1283,8 @@ void Source::LanguageProtocolView::update_diagnostics(std::vector(text[pos]) <= 0b01111111 || static_cast(text[pos]) >= 0b11000000) + for(; pos < size;) { + if(static_cast(text[pos]) <= 0b01111111) { ++count; + ++pos; + } + else if(static_cast(text[pos]) >= 0b11111000) // Invalid UTF-8 byte + ++pos; + else if(static_cast(text[pos]) >= 0b11110000) { + ++count; + pos += 4; + } + else if(static_cast(text[pos]) >= 0b11100000) { + ++count; + pos += 3; + } + else if(static_cast(text[pos]) >= 0b11000000) { + ++count; + pos += 2; + } + else // // Invalid start of UTF-8 character + ++pos; } return count; } +size_t utf16_code_units_byte_count(const std::string &text, size_t code_units, size_t start_pos) { + if(code_units == 0) + return 0; + + size_t pos = start_pos; + size_t current_code_units = 0; + for(; pos < text.size();) { + if(static_cast(text[pos]) <= 0b01111111) { + ++current_code_units; + ++pos; + if(current_code_units >= code_units) + break; + } + else if(static_cast(text[pos]) >= 0b11111000) // Invalid UTF-8 byte + ++pos; + else if(static_cast(text[pos]) >= 0b11110000) { + current_code_units += 2; + pos += 4; + if(current_code_units >= code_units) + break; + } + else if(static_cast(text[pos]) >= 0b11100000) { + ++current_code_units; + pos += 3; + if(current_code_units >= code_units) + break; + } + else if(static_cast(text[pos]) >= 0b11000000) { + ++current_code_units; + pos += 2; + if(current_code_units >= code_units) + break; + } + else // // Invalid start of UTF-8 character + ++pos; + } + return pos - start_pos; +} + bool starts_with(const char *str, const std::string &test) noexcept { for(size_t i = 0; i < test.size(); ++i) { if(*str == '\0') diff --git a/src/utility.hpp b/src/utility.hpp index 16679e1..798875f 100644 --- a/src/utility.hpp +++ b/src/utility.hpp @@ -8,9 +8,12 @@ public: ~ScopeGuard(); }; -/// Returns number of utf8 characters in text argument +/// Returns number of utf8 characters in text size_t utf8_character_count(const std::string &text, size_t pos = 0, size_t length = std::string::npos) noexcept; +/// Returns number of bytes in the given utf16 code units in text +size_t utf16_code_units_byte_count(const std::string &text, size_t code_units, size_t start_pos = 0); + bool starts_with(const char *str, const std::string &test) noexcept; bool starts_with(const char *str, const char *test) noexcept; bool starts_with(const std::string &str, const std::string &test) noexcept; diff --git a/tests/utility_test.cpp b/tests/utility_test.cpp index b778612..47c1553 100644 --- a/tests/utility_test.cpp +++ b/tests/utility_test.cpp @@ -16,6 +16,40 @@ int main() { g_assert(utf8_character_count("æøå") == 3); g_assert(utf8_character_count("æøåtest") == 7); + g_assert_cmpuint(utf16_code_units_byte_count("", 0), ==, 0); + g_assert_cmpuint(utf16_code_units_byte_count("", 1), ==, 0); + g_assert_cmpuint(utf16_code_units_byte_count("test", 0), ==, 0); + g_assert_cmpuint(utf16_code_units_byte_count("test", 1), ==, 1); + g_assert_cmpuint(utf16_code_units_byte_count("test", 3), ==, 3); + g_assert_cmpuint(utf16_code_units_byte_count("test", 4), ==, 4); + g_assert_cmpuint(utf16_code_units_byte_count("test", 5), ==, 4); + + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 0), ==, 0); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 1), ==, 2); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 2), ==, 4); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 3), ==, 6); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 4), ==, 6); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 5), ==, 6); + + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 0, 2), ==, 0); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 1, 2), ==, 2); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 2, 2), ==, 4); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 3, 2), ==, 4); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 1, 6), ==, 0); + g_assert_cmpuint(utf16_code_units_byte_count("æøå", 0, 6), ==, 0); + + g_assert_cmpuint(strlen("🔥"), ==, 4); // Fire emoji + + g_assert_cmpuint(utf16_code_units_byte_count("🔥", 0), ==, 0); // Fire emoji + g_assert_cmpuint(utf16_code_units_byte_count("🔥", 2), ==, 4); // Fire emoji + g_assert_cmpuint(utf16_code_units_byte_count("🔥", 3), ==, 4); // Fire emoji + g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 0), ==, 0); // Fire emoji between test words + g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 4), ==, 4); // Fire emoji between test words + g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 6), ==, 8); // Fire emoji between test words + g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 7), ==, 9); // Fire emoji between test words + g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 10), ==, 12); // Fire emoji between test words + g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 11), ==, 12); // Fire emoji between test words + std::string empty; std::string test("test"); std::string testtest("testtest");