Skip to content

Commit

Permalink
Implement grapheme clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
lhecker committed Mar 21, 2024
1 parent 0a83946 commit 62b2cdb
Show file tree
Hide file tree
Showing 30 changed files with 2,964 additions and 335 deletions.
18 changes: 18 additions & 0 deletions .github/actions/spelling/expect/expect.txt
Expand Up @@ -144,6 +144,7 @@ bytebuffer
cac
cacafire
CALLCONV
CANDRABINDU
capslock
CARETBLINKINGENABLED
CARRIAGERETURN
Expand All @@ -155,6 +156,7 @@ cbiex
CBN
CBoolean
cbt
Ccc
CCCBB
cch
CCHAR
Expand All @@ -180,6 +182,7 @@ chaof
charinfo
CHARSETINFO
chh
chonker
chshdng
CHT
Cic
Expand Down Expand Up @@ -598,7 +601,9 @@ FEEF
fesb
FFAF
FFDE
FFFD
FFFDb
FFrom
fgbg
FGCOLOR
FGHIJ
Expand All @@ -617,6 +622,7 @@ FINDDOWN
FINDSTRINGEXACT
FINDUP
FIter
FITZPATRICK
FIXEDCONVERTED
FIXEDFILEINFO
Flg
Expand Down Expand Up @@ -888,11 +894,13 @@ jconcpp
JLO
JOBOBJECT
JOBOBJECTINFOCLASS
JONGSEONG
JPN
jsoncpp
Jsons
jsprovider
jumplist
JUNGSEONG
KAttrs
kawa
Kazu
Expand All @@ -911,6 +919,7 @@ keyups
KILLACTIVE
KILLFOCUS
kinda
KIYEOK
KLF
KLMNO
KLMNOPQRST
Expand Down Expand Up @@ -1020,6 +1029,7 @@ luma
lval
LVB
LVERTICAL
LVT
LWA
LWIN
lwkmvj
Expand Down Expand Up @@ -1049,6 +1059,7 @@ mdmerge
MDs
MEASUREITEM
megamix
Meh
memallocator
meme
MENUCHAR
Expand Down Expand Up @@ -1164,6 +1175,7 @@ NOMINMAX
NOMOVE
NONALERT
nonbreaking
noncharacter
nonclient
NONINFRINGEMENT
NONPREROTATED
Expand Down Expand Up @@ -1212,6 +1224,7 @@ ntuser
NTVDM
ntverp
nugetversions
NUKTA
nullness
nullonfailure
nullopts
Expand Down Expand Up @@ -1489,6 +1502,7 @@ renderengine
rendersize
reparented
reparenting
REPH
replatformed
Replymessage
repositorypath
Expand Down Expand Up @@ -1517,6 +1531,7 @@ rgw
RIGHTALIGN
RIGHTBUTTON
riid
ris
RIS
roadmap
robomac
Expand Down Expand Up @@ -1883,6 +1898,7 @@ UPDATEDISPLAY
UPDOWN
UPKEY
upss
UPSS

Check warning

Code scanning / check-spelling

Ignored Expect Variant Warning

UPSS is ignored by check spelling because another more general variant is also in expect. (ignored-expect-variant)
uregex
URegular
usebackq
Expand Down Expand Up @@ -1925,6 +1941,7 @@ vga
vgaoem
viewkind
viewports
VIRAMA
Virt
VIRTTERM
vkey
Expand Down Expand Up @@ -2165,6 +2182,7 @@ Zabcdefghijklmn
Zabcdefghijklmnopqrstuvwxyz
ZCmd
ZCtrl
ZWJs
zxcvbnm
ZYXWVU
ZYXWVUTd
5 changes: 5 additions & 0 deletions doc/cascadia/profiles.schema.json
Expand Up @@ -2344,6 +2344,11 @@
"description": "Force the terminal to use the legacy input encoding. Certain keys in some applications may stop working when enabling this setting.",
"type": "boolean"
},
"experimental.graphemes": {
"default": true,
"description": "When set to true, the terminal will use grapheme cluster boundaries for cursor movement. Otherwise, the terminal will use codepoint boundaries.",
"type": "boolean"
},
"experimental.useBackgroundImageForWindow": {
"default": false,
"description": "When set to true, the background image for the currently focused profile is expanded to encompass the entire window, beneath other panes.",
Expand Down
62 changes: 23 additions & 39 deletions src/buffer/out/Row.cpp
Expand Up @@ -5,10 +5,8 @@
#include "Row.hpp"

#include <isa_availability.h>
#include <til/unicode.h>

#include "textBuffer.hpp"
#include "../../types/inc/GlyphWidth.hpp"
#include "../../types/inc/CodepointWidthDetector.hpp"

// It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
// performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
Expand Down Expand Up @@ -646,60 +644,45 @@ catch (...)
//
// We can infer the "end" from the amount of columns we're given (colLimit - colBeg),
// because ASCII is always 1 column wide per character.
auto it = chars.begin();
const auto end = it + std::min<size_t>(chars.size(), colLimit - colBeg);
const auto len = std::min<size_t>(chars.size(), colLimit - colBeg);
size_t ch = chBeg;

while (it != end)
for (size_t off = 0; off < len; ++off)
{
if (*it >= 0x80) [[unlikely]]
if (chars[off] >= 0x80) [[unlikely]]
{
_replaceTextUnicode(ch, it);
_replaceTextUnicode(ch, off);
return;
}

til::at(row._charOffsets, colEnd) = gsl::narrow_cast<uint16_t>(ch);
++colEnd;
++ch;
++it;
}

colEndDirty = colEnd;
charsConsumed = ch - chBeg;
}

[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, size_t off)
{
const auto end = chars.end();
auto& cwd = CodepointWidthDetector::Singleton();
const auto len = chars.size();

while (it != end)
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
// and let GraphemeNext() find the next proper grapheme boundary.
if (off != 0)
{
unsigned int width = 1;
auto ptr = &*it;
const auto wch = *ptr;
size_t advance = 1;

++it;

// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
// It also allows us to skip the surrogate pair decoding at the same time.
if (wch >= 0x80)
{
if (til::is_surrogate(wch))
{
if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
{
advance = 2;
++it;
}
else
{
ptr = &UNICODE_REPLACEMENT;
}
}
--colEnd;
--ch;
--off;
}

width = IsGlyphFullWidth({ ptr, advance }) + 1u;
}
while (off < len)
{
int width;
const auto end = cwd.GraphemeNext(chars, off, &width);

const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
if (colEndNew > colLimit)
Expand All @@ -719,7 +702,8 @@ catch (...)
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
}

ch += advance;
ch += end - off;
off = end;
}

colEndDirty = colEnd;
Expand Down Expand Up @@ -1062,7 +1046,7 @@ std::wstring_view ROW::GetText() const noexcept

std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
{
const til::CoordType columns = _columnCount;
const auto columns = GetReadableColumnCount();
const auto colBeg = clamp(columnBegin, 0, columns);
const auto colEnd = clamp(columnEnd, colBeg, columns);
const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));
Expand Down
2 changes: 1 addition & 1 deletion src/buffer/out/Row.hpp
Expand Up @@ -181,7 +181,7 @@ class ROW final
bool IsValid() const noexcept;
void ReplaceCharacters(til::CoordType width) noexcept;
void ReplaceText() noexcept;
void _replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept;
void _replaceTextUnicode(size_t ch, size_t off);
void CopyTextFrom(const std::span<const uint16_t>& charOffsets) noexcept;
static void _copyOffsets(uint16_t* dst, const uint16_t* src, uint16_t size, uint16_t offset) noexcept;
void Finish();
Expand Down
69 changes: 26 additions & 43 deletions src/buffer/out/textBuffer.cpp
Expand Up @@ -2,16 +2,13 @@
// Licensed under the MIT license.

#include "precomp.h"

#include "textBuffer.hpp"

#include <til/hash.h>
#include <til/unicode.h>

#include "UTextAdapter.h"
#include "../../types/inc/GlyphWidth.hpp"
#include "../../types/inc/CodepointWidthDetector.hpp"
#include "../renderer/base/renderer.hpp"
#include "../types/inc/convert.hpp"
#include "../types/inc/utils.hpp"

using namespace Microsoft::Console;
Expand Down Expand Up @@ -408,17 +405,17 @@ void TextBuffer::_PrepareForDoubleByteSequence(const DbcsAttribute dbcsAttribute
// Given the character offset `position` in the `chars` string, this function returns the starting position of the next grapheme.
// For instance, given a `chars` of L"x\uD83D\uDE42y" and a `position` of 1 it'll return 3.
// GraphemePrev would do the exact inverse of this operation.
// In the future, these functions are expected to also deliver information about how many columns a grapheme occupies.
// (I know that mere UTF-16 code point iteration doesn't handle graphemes, but that's what we're working towards.)
size_t TextBuffer::GraphemeNext(const std::wstring_view& chars, size_t position) noexcept
{
return til::utf16_iterate_next(chars, position);
auto& cwd = CodepointWidthDetector::Singleton();
return cwd.GraphemeNext(chars, position, nullptr);
}

// It's the counterpart to GraphemeNext. See GraphemeNext.
size_t TextBuffer::GraphemePrev(const std::wstring_view& chars, size_t position) noexcept
{
return til::utf16_iterate_prev(chars, position);
auto& cwd = CodepointWidthDetector::Singleton();
return cwd.GraphemePrev(chars, position, nullptr);
}

// Ever wondered how much space a piece of text needs before inserting it? This function will tell you!
Expand All @@ -445,7 +442,7 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
{
}

const auto dist = gsl::narrow_cast<size_t>(it - beg);
auto dist = gsl::narrow_cast<size_t>(it - beg);
auto col = gsl::narrow_cast<til::CoordType>(dist);

if (it == asciiEnd) [[likely]]
Expand All @@ -455,33 +452,23 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
}

// Unicode slow-path where we need to count text and columns separately.
for (;;)
{
auto ptr = &*it;
const auto wch = *ptr;
size_t len = 1;

col++;
auto& cwd = CodepointWidthDetector::Singleton();
const auto len = chars.size();

// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
// It also allows us to skip the surrogate pair decoding at the same time.
if (wch >= 0x80)
{
if (til::is_surrogate(wch))
{
const auto it2 = it + 1;
if (til::is_leading_surrogate(wch) && it2 != end && til::is_trailing_surrogate(*it2))
{
len = 2;
}
else
{
ptr = &UNICODE_REPLACEMENT;
}
}
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
// and let GraphemeNext() find the next proper grapheme boundary.
if (dist != 0)
{
dist--;
col--;
}

col += IsGlyphFullWidth({ ptr, len });
}
while (dist < len)
{
int width;
dist = cwd.GraphemeNext(chars, dist, &width);
col += width;

// If we ran out of columns, we need to always return `columnLimit` and not `col`,
// because if we tried inserting a wide glyph into just 1 remaining column it will
Expand All @@ -490,17 +477,13 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
if (col > columnLimit)
{
columns = columnLimit;
return gsl::narrow_cast<size_t>(it - beg);
}

// But if we simply ran out of text we just need to return the actual number of columns.
it += len;
if (it == end)
{
columns = col;
return chars.size();
return dist;
}
}

// But if we simply ran out of text we just need to return the actual number of columns.
columns = col;
return chars.size();
}

// Pretend as if `position` is a regular cursor in the TextBuffer.
Expand Down
1 change: 1 addition & 0 deletions src/cascadia/TerminalCore/ICoreSettings.idl
Expand Up @@ -20,6 +20,7 @@ namespace Microsoft.Terminal.Core
String WordDelimiters;

Boolean ForceVTInput;
Boolean Graphemes;
Boolean TrimBlockSelection;
Boolean DetectURLs;
Boolean VtPassthrough;
Expand Down

0 comments on commit 62b2cdb

Please sign in to comment.