Implement grapheme clusters

microsoft · Mar 21, 2024 · 62b2cdb · 62b2cdb
1 parent 0a83946
commit 62b2cdb
Show file tree

Hide file tree

Showing 30 changed files with 2,964 additions and 335 deletions.
diff --git a/.github/actions/spelling/expect/expect.txt b/.github/actions/spelling/expect/expect.txt
@@ -144,6 +144,7 @@ bytebuffer
 cac
 cacafire
 CALLCONV
+CANDRABINDU
 capslock
 CARETBLINKINGENABLED
 CARRIAGERETURN
@@ -155,6 +156,7 @@ cbiex
 CBN
 CBoolean
 cbt
+Ccc
 CCCBB
 cch
 CCHAR
@@ -180,6 +182,7 @@ chaof
 charinfo
 CHARSETINFO
 chh
+chonker
 chshdng
 CHT
 Cic
@@ -598,7 +601,9 @@ FEEF
 fesb
 FFAF
 FFDE
+FFFD
 FFFDb
+FFrom
 fgbg
 FGCOLOR
 FGHIJ
@@ -617,6 +622,7 @@ FINDDOWN
 FINDSTRINGEXACT
 FINDUP
 FIter
+FITZPATRICK
 FIXEDCONVERTED
 FIXEDFILEINFO
 Flg
@@ -888,11 +894,13 @@ jconcpp
 JLO
 JOBOBJECT
 JOBOBJECTINFOCLASS
+JONGSEONG
 JPN
 jsoncpp
 Jsons
 jsprovider
 jumplist
+JUNGSEONG
 KAttrs
 kawa
 Kazu
@@ -911,6 +919,7 @@ keyups
 KILLACTIVE
 KILLFOCUS
 kinda
+KIYEOK
 KLF
 KLMNO
 KLMNOPQRST
@@ -1020,6 +1029,7 @@ luma
 lval
 LVB
 LVERTICAL
+LVT
 LWA
 LWIN
 lwkmvj
@@ -1049,6 +1059,7 @@ mdmerge
 MDs
 MEASUREITEM
 megamix
+Meh
 memallocator
 meme
 MENUCHAR
@@ -1164,6 +1175,7 @@ NOMINMAX
 NOMOVE
 NONALERT
 nonbreaking
+noncharacter
 nonclient
 NONINFRINGEMENT
 NONPREROTATED
@@ -1212,6 +1224,7 @@ ntuser
 NTVDM
 ntverp
 nugetversions
+NUKTA
 nullness
 nullonfailure
 nullopts
@@ -1489,6 +1502,7 @@ renderengine
 rendersize
 reparented
 reparenting
+REPH
 replatformed
 Replymessage
 repositorypath
@@ -1517,6 +1531,7 @@ rgw
 RIGHTALIGN
 RIGHTBUTTON
 riid
+ris
 RIS
 roadmap
 robomac
@@ -1883,6 +1898,7 @@ UPDATEDISPLAY
 UPDOWN
 UPKEY
 upss
+UPSS
 uregex
 URegular
 usebackq
@@ -1925,6 +1941,7 @@ vga
 vgaoem
 viewkind
 viewports
+VIRAMA
 Virt
 VIRTTERM
 vkey
@@ -2165,6 +2182,7 @@ Zabcdefghijklmn
 Zabcdefghijklmnopqrstuvwxyz
 ZCmd
 ZCtrl
+ZWJs
 zxcvbnm
 ZYXWVU
 ZYXWVUTd
diff --git a/doc/cascadia/profiles.schema.json b/doc/cascadia/profiles.schema.json
@@ -2344,6 +2344,11 @@
  "description": "Force the terminal to use the legacy input encoding. Certain keys in some applications may stop working when enabling this setting.",
  "type": "boolean"
  },
+ "experimental.graphemes": {
+ "default": true,
+ "description": "When set to true, the terminal will use grapheme cluster boundaries for cursor movement. Otherwise, the terminal will use codepoint boundaries.",
+ "type": "boolean"
+ },
  "experimental.useBackgroundImageForWindow": {
  "default": false,
  "description": "When set to true, the background image for the currently focused profile is expanded to encompass the entire window, beneath other panes.",

diff --git a/src/buffer/out/Row.cpp b/src/buffer/out/Row.cpp
@@ -5,10 +5,8 @@
 #include "Row.hpp"
 
 #include <isa_availability.h>
-#include <til/unicode.h>
 
-#include "textBuffer.hpp"
-#include "../../types/inc/GlyphWidth.hpp"
+#include "../../types/inc/CodepointWidthDetector.hpp"
 
 // It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
 // performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
@@ -646,60 +644,45 @@ catch (...)
  //
  // We can infer the "end" from the amount of columns we're given (colLimit - colBeg),
  // because ASCII is always 1 column wide per character.
- auto it = chars.begin();
- const auto end = it + std::min<size_t>(chars.size(), colLimit - colBeg);
+ const auto len = std::min<size_t>(chars.size(), colLimit - colBeg);
  size_t ch = chBeg;
 
- while (it != end)
+ for (size_t off = 0; off < len; ++off)
  {
- if (*it >= 0x80) [[unlikely]]
+ if (chars[off] >= 0x80) [[unlikely]]
  {
- _replaceTextUnicode(ch, it);
+ _replaceTextUnicode(ch, off);
  return;
  }
 
  til::at(row._charOffsets, colEnd) = gsl::narrow_cast<uint16_t>(ch);
  ++colEnd;
  ++ch;
- ++it;
  }
 
  colEndDirty = colEnd;
  charsConsumed = ch - chBeg;
 }
 
-[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
+[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, size_t off)
 {
- const auto end = chars.end();
+ auto& cwd = CodepointWidthDetector::Singleton();
+ const auto len = chars.size();
 
- while (it != end)
+ // The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
+ // In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
+ // and let GraphemeNext() find the next proper grapheme boundary.
+ if (off != 0)
  {
- unsigned int width = 1;
- auto ptr = &*it;
- const auto wch = *ptr;
- size_t advance = 1;
-
- ++it;
-
- // Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
- // It also allows us to skip the surrogate pair decoding at the same time.
- if (wch >= 0x80)
- {
- if (til::is_surrogate(wch))
- {
- if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
- {
- advance = 2;
- ++it;
- }
- else
- {
- ptr = &UNICODE_REPLACEMENT;
- }
- }
+ --colEnd;
+ --ch;
+ --off;
+ }
 
- width = IsGlyphFullWidth({ ptr, advance }) + 1u;
- }
+ while (off < len)
+ {
+ int width;
+ const auto end = cwd.GraphemeNext(chars, off, &width);
 
  const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
  if (colEndNew > colLimit)
@@ -719,7 +702,8 @@ catch (...)
  til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
  }
 
- ch += advance;
+ ch += end - off;
+ off = end;
  }
 
  colEndDirty = colEnd;
@@ -1062,7 +1046,7 @@ std::wstring_view ROW::GetText() const noexcept
 
 std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
 {
- const til::CoordType columns = _columnCount;
+ const auto columns = GetReadableColumnCount();
  const auto colBeg = clamp(columnBegin, 0, columns);
  const auto colEnd = clamp(columnEnd, colBeg, columns);
  const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));

diff --git a/src/buffer/out/Row.hpp b/src/buffer/out/Row.hpp
@@ -181,7 +181,7 @@ class ROW final
  bool IsValid() const noexcept;
  void ReplaceCharacters(til::CoordType width) noexcept;
  void ReplaceText() noexcept;
- void _replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept;
+ void _replaceTextUnicode(size_t ch, size_t off);
  void CopyTextFrom(const std::span<const uint16_t>& charOffsets) noexcept;
  static void _copyOffsets(uint16_t* dst, const uint16_t* src, uint16_t size, uint16_t offset) noexcept;
  void Finish();

diff --git a/src/buffer/out/textBuffer.cpp b/src/buffer/out/textBuffer.cpp
@@ -2,16 +2,13 @@
 // Licensed under the MIT license.
 
 #include "precomp.h"
-
 #include "textBuffer.hpp"
 
 #include <til/hash.h>
-#include <til/unicode.h>
 
 #include "UTextAdapter.h"
-#include "../../types/inc/GlyphWidth.hpp"
+#include "../../types/inc/CodepointWidthDetector.hpp"
 #include "../renderer/base/renderer.hpp"
-#include "../types/inc/convert.hpp"
 #include "../types/inc/utils.hpp"
 
 using namespace Microsoft::Console;
@@ -408,17 +405,17 @@ void TextBuffer::_PrepareForDoubleByteSequence(const DbcsAttribute dbcsAttribute
 // Given the character offset `position` in the `chars` string, this function returns the starting position of the next grapheme.
 // For instance, given a `chars` of L"x\uD83D\uDE42y" and a `position` of 1 it'll return 3.
 // GraphemePrev would do the exact inverse of this operation.
-// In the future, these functions are expected to also deliver information about how many columns a grapheme occupies.
-// (I know that mere UTF-16 code point iteration doesn't handle graphemes, but that's what we're working towards.)
 size_t TextBuffer::GraphemeNext(const std::wstring_view& chars, size_t position) noexcept
 {
- return til::utf16_iterate_next(chars, position);
+ auto& cwd = CodepointWidthDetector::Singleton();
+ return cwd.GraphemeNext(chars, position, nullptr);
 }
 
 // It's the counterpart to GraphemeNext. See GraphemeNext.
 size_t TextBuffer::GraphemePrev(const std::wstring_view& chars, size_t position) noexcept
 {
- return til::utf16_iterate_prev(chars, position);
+ auto& cwd = CodepointWidthDetector::Singleton();
+ return cwd.GraphemePrev(chars, position, nullptr);
 }
 
 // Ever wondered how much space a piece of text needs before inserting it? This function will tell you!
@@ -445,7 +442,7 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
  {
  }
 
- const auto dist = gsl::narrow_cast<size_t>(it - beg);
+ auto dist = gsl::narrow_cast<size_t>(it - beg);
  auto col = gsl::narrow_cast<til::CoordType>(dist);
 
  if (it == asciiEnd) [[likely]]
@@ -455,33 +452,23 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
  }
 
  // Unicode slow-path where we need to count text and columns separately.
- for (;;)
- {
- auto ptr = &*it;
- const auto wch = *ptr;
- size_t len = 1;
-
- col++;
+ auto& cwd = CodepointWidthDetector::Singleton();
+ const auto len = chars.size();
 
- // Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
- // It also allows us to skip the surrogate pair decoding at the same time.
- if (wch >= 0x80)
- {
- if (til::is_surrogate(wch))
- {
- const auto it2 = it + 1;
- if (til::is_leading_surrogate(wch) && it2 != end && til::is_trailing_surrogate(*it2))
- {
- len = 2;
- }
- else
- {
- ptr = &UNICODE_REPLACEMENT;
- }
- }
+ // The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
+ // In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
+ // and let GraphemeNext() find the next proper grapheme boundary.
+ if (dist != 0)
+ {
+ dist--;
+ col--;
+ }
 
- col += IsGlyphFullWidth({ ptr, len });
- }
+ while (dist < len)
+ {
+ int width;
+ dist = cwd.GraphemeNext(chars, dist, &width);
+ col += width;
 
  // If we ran out of columns, we need to always return `columnLimit` and not `cols`,
  // because if we tried inserting a wide glyph into just 1 remaining column it will
@@ -490,17 +477,13 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
  if (col > columnLimit)
  {
  columns = columnLimit;
- return gsl::narrow_cast<size_t>(it - beg);
- }
-
- // But if we simply ran out of text we just need to return the actual number of columns.
- it += len;
- if (it == end)
- {
- columns = col;
- return chars.size();
+ return dist;
  }
  }
+
+ // But if we simply ran out of text we just need to return the actual number of columns.
+ columns = col;
+ return chars.size();
 }
 
 // Pretend as if `position` is a regular cursor in the TextBuffer.

diff --git a/src/cascadia/TerminalCore/ICoreSettings.idl b/src/cascadia/TerminalCore/ICoreSettings.idl
@@ -20,6 +20,7 @@ namespace Microsoft.Terminal.Core
  String WordDelimiters;
 
  Boolean ForceVTInput;
+ Boolean Graphemes;
  Boolean TrimBlockSelection;
  Boolean DetectURLs;
  Boolean VtPassthrough;