Language client: improved support for both UTF-16 offsets and offsetEncoding set to utf-8

5 years ago · be5e36627d
5 changed files with 122 additions and 20 deletions
--- a/src/source_language_protocol.cpp
+++ b/src/source_language_protocol.cpp
@ -254,6 +254,8 @@ LanguageProtocol::Capabilities LanguageProtocol::Client::initialize(Source::Lang
            capabilities.type_coverage = capabilities_pt->get<bool>("typeCoverageProvider", false);
          }

+          capabilities.use_line_index = result.get<std::string>("offsetEncoding", "") == "utf-8";
+
          write_notification("initialized", "");
        }
        result_processed.set_value();
@ -709,13 +711,11 @@ void Source::LanguageProtocolView::setup_navigation_and_refactoring() {
      });
      result_processed.get_future().get();

-      auto embolden_token = [](std::string &line, int token_start_pos, int token_end_pos) {
-        Glib::ustring uline = std::move(line);
-
-        //markup token as bold
+      auto embolden_token = [this](std::string &line, int token_start_pos, int token_end_pos) {
+        // Markup token as bold
        size_t pos = 0;
-        while((pos = uline.find('&', pos)) != Glib::ustring::npos) {
-          size_t pos2 = uline.find(';', pos + 2);
+        while((pos = line.find('&', pos)) != std::string::npos) {
+          size_t pos2 = line.find(';', pos + 2);
          if(static_cast<size_t>(token_start_pos) > pos) {
            token_start_pos += pos2 - pos;
            token_end_pos += pos2 - pos;
@ -726,22 +726,27 @@ void Source::LanguageProtocolView::setup_navigation_and_refactoring() {
            break;
          pos = pos2 + 1;
        }
-        if(static_cast<size_t>(token_start_pos) > uline.size())
-          token_start_pos = uline.size();
-        if(static_cast<size_t>(token_end_pos) > uline.size())
-          token_end_pos = uline.size();
-        if(token_start_pos != token_end_pos) {
-          uline.insert(token_end_pos, "</b>");
-          uline.insert(token_start_pos, "<b>");
+
+        if(!capabilities.use_line_index) {
+          auto code_units_diff = token_end_pos - token_start_pos;
+          token_start_pos = utf16_code_units_byte_count(line, token_start_pos);
+          token_end_pos = token_start_pos + utf16_code_units_byte_count(line, code_units_diff, token_start_pos);
+        }
+
+        if(static_cast<size_t>(token_start_pos) > line.size())
+          token_start_pos = line.size();
+        if(static_cast<size_t>(token_end_pos) > line.size())
+          token_end_pos = line.size();
+        if(token_start_pos < token_end_pos) {
+          line.insert(token_end_pos, "</b>");
+          line.insert(token_start_pos, "<b>");
        }

        size_t start_pos = 0;
-        while(start_pos < uline.size() && (uline[start_pos] == ' ' || uline[start_pos] == '\t'))
+        while(start_pos < line.size() && (line[start_pos] == ' ' || line[start_pos] == '\t'))
          ++start_pos;
        if(start_pos > 0)
-          uline.erase(0, start_pos);
-
-        line = std::move(uline);
+          line.erase(0, start_pos);
      };

      std::unordered_map<std::string, std::vector<std::string>> file_lines;
@ -1278,6 +1283,8 @@ void Source::LanguageProtocolView::update_diagnostics(std::vector<LanguageProtoc
 }

 Gtk::TextIter Source::LanguageProtocolView::get_iter_at_line_pos(int line, int pos) { // Turns a line/pos pair into a text iterator, choosing the lookup by capabilities.use_line_index
+  if(capabilities.use_line_index) // Set in initialize() when the result's "offsetEncoding" equals "utf-8"
+    return get_iter_at_line_index(line, pos); // NOTE(review): presumably treats pos as a byte index within the line - confirm
   return get_iter_at_line_offset(line, pos); // Otherwise pos is passed on as a line offset
 }

--- a/src/source_language_protocol.hpp
+++ b/src/source_language_protocol.hpp
@ -110,6 +110,7 @@ namespace LanguageProtocol {
    bool rename = false;
    bool code_action = false;
    bool type_coverage = false;
+    bool use_line_index = false;
  };

  std::string escape_text(std::string text);
--- a/src/utility.cpp
+++ b/src/utility.cpp
@ -9,13 +9,70 @@ ScopeGuard::~ScopeGuard() {
 size_t utf8_character_count(const std::string &text, size_t pos, size_t length) noexcept {
  size_t count = 0;
  auto size = length == std::string::npos ? text.size() : std::min(pos + length, text.size());
-  for(; pos < size; ++pos) {
-    if(static_cast<unsigned char>(text[pos]) <= 0b01111111 || static_cast<unsigned char>(text[pos]) >= 0b11000000)
+  for(; pos < size;) {
+    if(static_cast<unsigned char>(text[pos]) <= 0b01111111) {
      ++count;
+      ++pos;
+    }
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11111000) // Invalid UTF-8 byte
+      ++pos;
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11110000) {
+      ++count;
+      pos += 4;
+    }
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11100000) {
+      ++count;
+      pos += 3;
+    }
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11000000) {
+      ++count;
+      pos += 2;
+    }
+    else // Invalid start of UTF-8 character
+      ++pos;
  }
  return count;
 }

+size_t utf16_code_units_byte_count(const std::string &text, size_t code_units, size_t start_pos) { // Returns how many bytes of text, counted from byte position start_pos, are spanned by code_units UTF-16 code units
+  if(code_units == 0)
+    return 0;
+
+  size_t pos = start_pos; // Current byte position in text
+  size_t current_code_units = 0; // UTF-16 code units counted so far
+  for(; pos < text.size();) {
+    if(static_cast<unsigned char>(text[pos]) <= 0b01111111) { // Single-byte (ASCII) character: one code unit
+      ++current_code_units;
+      ++pos;
+      if(current_code_units >= code_units)
+        break;
+    }
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11111000) // Invalid UTF-8 byte
+      ++pos; // Skipped without counting a code unit
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11110000) { // Lead byte of a 4-byte sequence: counted as two code units
+      current_code_units += 2; // May overshoot code_units by one; the >= check below still stops the loop
+      pos += 4; // Not bounds-checked: pos can pass text.size() if the sequence is truncated
+      if(current_code_units >= code_units)
+        break;
+    }
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11100000) { // Lead byte of a 3-byte sequence: one code unit
+      ++current_code_units;
+      pos += 3; // Not bounds-checked, see the 4-byte case
+      if(current_code_units >= code_units)
+        break;
+    }
+    else if(static_cast<unsigned char>(text[pos]) >= 0b11000000) { // Lead byte of a 2-byte sequence: one code unit
+      ++current_code_units;
+      pos += 2; // Not bounds-checked, see the 4-byte case
+      if(current_code_units >= code_units)
+        break;
+    }
+    else // Invalid start of UTF-8 character
+      ++pos;
+  }
+  return pos - start_pos; // 0 when start_pos is at or beyond text.size(), since the loop never runs
+}
+
 bool starts_with(const char *str, const std::string &test) noexcept {
  for(size_t i = 0; i < test.size(); ++i) {
    if(*str == '\0')
--- a/src/utility.hpp
+++ b/src/utility.hpp
@ -8,9 +8,12 @@ public:
  ~ScopeGuard();
 };

-/// Returns number of utf8 characters in text argument
+/// Returns number of utf8 characters in text
 size_t utf8_character_count(const std::string &text, size_t pos = 0, size_t length = std::string::npos) noexcept;

+/// Returns the number of bytes in text, starting at byte position start_pos, spanned by the given number of utf16 code units
+size_t utf16_code_units_byte_count(const std::string &text, size_t code_units, size_t start_pos = 0);
+
 bool starts_with(const char *str, const std::string &test) noexcept;
 bool starts_with(const char *str, const char *test) noexcept;
 bool starts_with(const std::string &str, const std::string &test) noexcept;
--- a/tests/utility_test.cpp
+++ b/tests/utility_test.cpp
@ -16,6 +16,40 @@ int main() {
  g_assert(utf8_character_count("æøå") == 3);
  g_assert(utf8_character_count("æøåtest") == 7);

+  g_assert_cmpuint(utf16_code_units_byte_count("", 0), ==, 0);
+  g_assert_cmpuint(utf16_code_units_byte_count("", 1), ==, 0);
+  g_assert_cmpuint(utf16_code_units_byte_count("test", 0), ==, 0);
+  g_assert_cmpuint(utf16_code_units_byte_count("test", 1), ==, 1);
+  g_assert_cmpuint(utf16_code_units_byte_count("test", 3), ==, 3);
+  g_assert_cmpuint(utf16_code_units_byte_count("test", 4), ==, 4);
+  g_assert_cmpuint(utf16_code_units_byte_count("test", 5), ==, 4);
+
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 0), ==, 0);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 1), ==, 2);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 2), ==, 4);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 3), ==, 6);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 4), ==, 6);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 5), ==, 6);
+
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 0, 2), ==, 0);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 1, 2), ==, 2);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 2, 2), ==, 4);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 3, 2), ==, 4);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 1, 6), ==, 0);
+  g_assert_cmpuint(utf16_code_units_byte_count("æøå", 0, 6), ==, 0);
+
+  g_assert_cmpuint(strlen("🔥"), ==, 4); // Fire emoji
+
+  g_assert_cmpuint(utf16_code_units_byte_count("🔥", 0), ==, 0);           // Fire emoji
+  g_assert_cmpuint(utf16_code_units_byte_count("🔥", 2), ==, 4);           // Fire emoji
+  g_assert_cmpuint(utf16_code_units_byte_count("🔥", 3), ==, 4);           // Fire emoji
+  g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 0), ==, 0);   // Fire emoji between test words
+  g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 4), ==, 4);   // Fire emoji between test words
+  g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 6), ==, 8);   // Fire emoji between test words
+  g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 7), ==, 9);   // Fire emoji between test words
+  g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 10), ==, 12); // Fire emoji between test words
+  g_assert_cmpuint(utf16_code_units_byte_count("test🔥test", 11), ==, 12); // Fire emoji between test words
+
  std::string empty;
  std::string test("test");
  std::string testtest("testtest");