Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement grapheme clusters #16916

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/actions/spelling/expect/expect.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
cac
cacafire
CALLCONV
CANDRABINDU
capslock
CARETBLINKINGENABLED
CARRIAGERETURN
Expand All @@ -155,6 +156,7 @@
CBN
CBoolean
cbt
Ccc
CCCBB
cch
CCHAR
Expand All @@ -180,6 +182,7 @@
charinfo
CHARSETINFO
chh
chonker
chshdng
CHT
Cic
Expand Down Expand Up @@ -598,7 +601,9 @@
fesb
FFAF
FFDE
FFFD
FFFDb
FFrom
fgbg
FGCOLOR
FGHIJ
Expand All @@ -617,6 +622,7 @@
FINDSTRINGEXACT
FINDUP
FIter
FITZPATRICK
FIXEDCONVERTED
FIXEDFILEINFO
Flg
Expand Down Expand Up @@ -888,11 +894,13 @@
JLO
JOBOBJECT
JOBOBJECTINFOCLASS
JONGSEONG
JPN
jsoncpp
Jsons
jsprovider
jumplist
JUNGSEONG
KAttrs
kawa
Kazu
Expand All @@ -911,6 +919,7 @@
KILLACTIVE
KILLFOCUS
kinda
KIYEOK
KLF
KLMNO
KLMNOPQRST
Expand Down Expand Up @@ -1020,6 +1029,7 @@
lval
LVB
LVERTICAL
LVT
LWA
LWIN
lwkmvj
Expand Down Expand Up @@ -1049,6 +1059,7 @@
MDs
MEASUREITEM
megamix
Meh
memallocator
meme
MENUCHAR
Expand Down Expand Up @@ -1164,6 +1175,7 @@
NOMOVE
NONALERT
nonbreaking
noncharacter
nonclient
NONINFRINGEMENT
NONPREROTATED
Expand Down Expand Up @@ -1212,6 +1224,7 @@
NTVDM
ntverp
nugetversions
NUKTA
nullness
nullonfailure
nullopts
Expand Down Expand Up @@ -1489,6 +1502,7 @@
rendersize
reparented
reparenting
REPH
replatformed
Replymessage
repositorypath
Expand Down Expand Up @@ -1517,6 +1531,7 @@
RIGHTALIGN
RIGHTBUTTON
riid
ris
RIS
roadmap
robomac
Expand Down Expand Up @@ -1883,6 +1898,7 @@
UPDOWN
UPKEY
upss
UPSS
Fixed Show fixed Hide fixed
uregex
URegular
usebackq
Expand Down Expand Up @@ -1925,6 +1941,7 @@
vgaoem
viewkind
viewports
VIRAMA
Virt
VIRTTERM
vkey
Expand Down Expand Up @@ -2165,6 +2182,7 @@
Zabcdefghijklmnopqrstuvwxyz
ZCmd
ZCtrl
ZWJs
zxcvbnm
ZYXWVU
ZYXWVUTd
5 changes: 5 additions & 0 deletions doc/cascadia/profiles.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2344,6 +2344,11 @@
"description": "Force the terminal to use the legacy input encoding. Certain keys in some applications may stop working when enabling this setting.",
"type": "boolean"
},
"experimental.graphemes": {
"default": true,
"description": "When set to true, the terminal will use grapheme cluster boundaries for cursor movement. Otherwise, the terminal will use codepoint boundaries.",
"type": "boolean"
},
"experimental.useBackgroundImageForWindow": {
"default": false,
"description": "When set to true, the background image for the currently focused profile is expanded to encompass the entire window, beneath other panes.",
Expand Down
62 changes: 23 additions & 39 deletions src/buffer/out/Row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@
#include "Row.hpp"

#include <isa_availability.h>
#include <til/unicode.h>

#include "textBuffer.hpp"
#include "../../types/inc/GlyphWidth.hpp"
#include "../../types/inc/CodepointWidthDetector.hpp"

// It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
// performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
Expand Down Expand Up @@ -646,60 +644,45 @@ catch (...)
//
// We can infer the "end" from the amount of columns we're given (colLimit - colBeg),
// because ASCII is always 1 column wide per character.
auto it = chars.begin();
const auto end = it + std::min<size_t>(chars.size(), colLimit - colBeg);
const auto len = std::min<size_t>(chars.size(), colLimit - colBeg);
size_t ch = chBeg;

while (it != end)
for (size_t off = 0; off < len; ++off)
{
if (*it >= 0x80) [[unlikely]]
if (chars[off] >= 0x80) [[unlikely]]
{
_replaceTextUnicode(ch, it);
_replaceTextUnicode(ch, off);
return;
}

til::at(row._charOffsets, colEnd) = gsl::narrow_cast<uint16_t>(ch);
++colEnd;
++ch;
++it;
}

colEndDirty = colEnd;
charsConsumed = ch - chBeg;
}

[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, size_t off)
{
const auto end = chars.end();
auto& cwd = CodepointWidthDetector::Singleton();
const auto len = chars.size();

while (it != end)
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
// and let GraphemeNext() find the next proper grapheme boundary.
if (off != 0)
{
unsigned int width = 1;
auto ptr = &*it;
const auto wch = *ptr;
size_t advance = 1;

++it;

// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
// It also allows us to skip the surrogate pair decoding at the same time.
if (wch >= 0x80)
{
if (til::is_surrogate(wch))
{
if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
{
advance = 2;
++it;
}
else
{
ptr = &UNICODE_REPLACEMENT;
}
}
--colEnd;
--ch;
--off;
}

width = IsGlyphFullWidth({ ptr, advance }) + 1u;
}
while (off < len)
{
int width;
const auto end = cwd.GraphemeNext(chars, off, &width);

const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
if (colEndNew > colLimit)
Expand All @@ -719,7 +702,8 @@ catch (...)
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
}

ch += advance;
ch += end - off;
off = end;
}

colEndDirty = colEnd;
Expand Down Expand Up @@ -1062,7 +1046,7 @@ std::wstring_view ROW::GetText() const noexcept

std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
{
const til::CoordType columns = _columnCount;
const auto columns = GetReadableColumnCount();
const auto colBeg = clamp(columnBegin, 0, columns);
const auto colEnd = clamp(columnEnd, colBeg, columns);
const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));
Expand Down
2 changes: 1 addition & 1 deletion src/buffer/out/Row.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ class ROW final
bool IsValid() const noexcept;
void ReplaceCharacters(til::CoordType width) noexcept;
void ReplaceText() noexcept;
void _replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept;
void _replaceTextUnicode(size_t ch, size_t off);
void CopyTextFrom(const std::span<const uint16_t>& charOffsets) noexcept;
static void _copyOffsets(uint16_t* dst, const uint16_t* src, uint16_t size, uint16_t offset) noexcept;
void Finish();
Expand Down
69 changes: 26 additions & 43 deletions src/buffer/out/textBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@
// Licensed under the MIT license.

#include "precomp.h"

#include "textBuffer.hpp"

#include <til/hash.h>
#include <til/unicode.h>

#include "UTextAdapter.h"
#include "../../types/inc/GlyphWidth.hpp"
#include "../../types/inc/CodepointWidthDetector.hpp"
#include "../renderer/base/renderer.hpp"
#include "../types/inc/convert.hpp"
#include "../types/inc/utils.hpp"

using namespace Microsoft::Console;
Expand Down Expand Up @@ -408,17 +405,17 @@ void TextBuffer::_PrepareForDoubleByteSequence(const DbcsAttribute dbcsAttribute
// Given the character offset `position` in the `chars` string, this function returns the starting position of the next grapheme.
// For instance, given a `chars` of L"x\uD83D\uDE42y" and a `position` of 1 it'll return 3.
// GraphemePrev would do the exact inverse of this operation.
// In the future, these functions are expected to also deliver information about how many columns a grapheme occupies.
// (I know that mere UTF-16 code point iteration doesn't handle graphemes, but that's what we're working towards.)
size_t TextBuffer::GraphemeNext(const std::wstring_view& chars, size_t position) noexcept
{
return til::utf16_iterate_next(chars, position);
auto& cwd = CodepointWidthDetector::Singleton();
return cwd.GraphemeNext(chars, position, nullptr);
}

// It's the counterpart to GraphemeNext. See GraphemeNext.
size_t TextBuffer::GraphemePrev(const std::wstring_view& chars, size_t position) noexcept
{
return til::utf16_iterate_prev(chars, position);
auto& cwd = CodepointWidthDetector::Singleton();
return cwd.GraphemePrev(chars, position, nullptr);
}

// Ever wondered how much space a piece of text needs before inserting it? This function will tell you!
Expand All @@ -445,7 +442,7 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
{
}

const auto dist = gsl::narrow_cast<size_t>(it - beg);
auto dist = gsl::narrow_cast<size_t>(it - beg);
auto col = gsl::narrow_cast<til::CoordType>(dist);

if (it == asciiEnd) [[likely]]
Expand All @@ -455,33 +452,23 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
}

// Unicode slow-path where we need to count text and columns separately.
for (;;)
{
auto ptr = &*it;
const auto wch = *ptr;
size_t len = 1;

col++;
auto& cwd = CodepointWidthDetector::Singleton();
const auto len = chars.size();

// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
// It also allows us to skip the surrogate pair decoding at the same time.
if (wch >= 0x80)
{
if (til::is_surrogate(wch))
{
const auto it2 = it + 1;
if (til::is_leading_surrogate(wch) && it2 != end && til::is_trailing_surrogate(*it2))
{
len = 2;
}
else
{
ptr = &UNICODE_REPLACEMENT;
}
}
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
// and let GraphemeNext() find the next proper grapheme boundary.
if (dist != 0)
{
dist--;
col--;
}

col += IsGlyphFullWidth({ ptr, len });
}
while (dist < len)
{
int width;
dist = cwd.GraphemeNext(chars, dist, &width);
col += width;

// If we ran out of columns, we need to always return `columnLimit` and not `col`,
// because if we tried inserting a wide glyph into just 1 remaining column it will
Expand All @@ -490,17 +477,13 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
if (col > columnLimit)
{
columns = columnLimit;
return gsl::narrow_cast<size_t>(it - beg);
}

// But if we simply ran out of text we just need to return the actual number of columns.
it += len;
if (it == end)
{
columns = col;
return chars.size();
return dist;
}
}

// But if we simply ran out of text we just need to return the actual number of columns.
columns = col;
return chars.size();
}

// Pretend as if `position` is a regular cursor in the TextBuffer.
Expand Down
1 change: 1 addition & 0 deletions src/cascadia/TerminalCore/ICoreSettings.idl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ namespace Microsoft.Terminal.Core
String WordDelimiters;

Boolean ForceVTInput;
Boolean Graphemes;
Boolean TrimBlockSelection;
Boolean DetectURLs;
Boolean VtPassthrough;
Expand Down