223 lines
7.0 KiB
TypeScript
223 lines
7.0 KiB
TypeScript
import emojiRegex from 'emoji-regex'
|
|
import { eastAsianWidth } from 'get-east-asian-width'
|
|
import stripAnsi from 'strip-ansi'
|
|
import { getGraphemeSegmenter } from '../utils/intl.js'
|
|
|
|
// Emoji matcher built once at module load and shared by every width calculation.
// Callers reset `lastIndex` before each `test()` (presumably because the regex
// carries the global flag and would otherwise resume from the previous match —
// confirm against the emoji-regex package).
const EMOJI_REGEX = emojiRegex()
|
|
|
|
/**
 * Fallback JavaScript implementation of stringWidth when Bun.stringWidth is not available.
 *
 * Get the display width of a string as it would appear in a terminal.
 *
 * This is a more accurate alternative to the string-width package that correctly handles
 * characters like ⚠ (U+26A0) which string-width incorrectly reports as width 2.
 *
 * The implementation uses eastAsianWidth directly with ambiguousAsWide: false,
 * which correctly treats ambiguous-width characters as narrow (width 1) as
 * recommended by the Unicode standard for Western contexts.
 *
 * Strategy, cheapest first:
 *  1. Pure ASCII without escapes: count characters above 0x1f.
 *  2. Strip ANSI sequences when an escape character is present.
 *  3. Text without emoji / variation selectors / ZWJ: sum per-code-point widths.
 *  4. Otherwise: walk grapheme clusters, measuring emoji clusters via
 *     getEmojiWidth and other clusters by their first visible code point.
 *
 * @param str - Text to measure; may contain ANSI escape sequences.
 * @returns The width in terminal columns; 0 for empty or non-string input.
 */
function stringWidthJavaScript(str: string): number {
  if (typeof str !== 'string' || str.length === 0) {
    return 0
  }

  // Fast path: pure ASCII string (no ANSI codes, no wide chars).
  // Width is accumulated during the same scan that validates the string, so
  // the common case needs only one pass.
  let asciiWidth = 0
  let isPureAscii = true
  for (let i = 0; i < str.length; i++) {
    const code = str.charCodeAt(i)
    // DEL (0x7f), anything non-ASCII, or an ANSI escape (0x1b) ends the fast path
    if (code >= 127 || code === 0x1b) {
      isPureAscii = false
      break
    }
    // Control characters (<= 0x1f) take no columns
    if (code > 0x1f) {
      asciiWidth++
    }
  }
  if (isPureAscii) {
    return asciiWidth
  }

  // Strip ANSI only if an escape character is present
  const text = str.includes('\x1b') ? stripAnsi(str) : str
  if (text.length === 0) {
    return 0
  }

  // Fast path: simple Unicode (no emoji, variation selectors, or joiners)
  if (!needsSegmentation(text)) {
    let width = 0
    for (const char of text) {
      const codePoint = char.codePointAt(0)!
      if (!isZeroWidth(codePoint)) {
        width += eastAsianWidth(codePoint, { ambiguousAsWide: false })
      }
    }
    return width
  }

  let width = 0

  for (const { segment: grapheme } of getGraphemeSegmenter().segment(text)) {
    // Check for emoji first (most emoji sequences are width 2).
    // lastIndex is reset because the shared regex is reused across calls.
    EMOJI_REGEX.lastIndex = 0
    if (EMOJI_REGEX.test(grapheme)) {
      width += getEmojiWidth(grapheme)
      continue
    }

    // Calculate width for non-emoji graphemes.
    // For grapheme clusters (like Devanagari conjuncts with virama+ZWJ), only count
    // the first non-zero-width character's width since the cluster renders as one glyph.
    for (const char of grapheme) {
      const codePoint = char.codePointAt(0)!
      if (!isZeroWidth(codePoint)) {
        width += eastAsianWidth(codePoint, { ambiguousAsWide: false })
        break
      }
    }
  }

  return width
}
|
|
|
|
/**
 * Decide whether a string must be split into grapheme clusters before its
 * width can be measured.
 *
 * Returns true as soon as any code point falls in one of the ranges checked
 * below (emoji blocks, regional indicators, variation selectors, ZWJ);
 * strings without such code points can be measured one code point at a time.
 *
 * @param str - Text to inspect (expected to be ANSI-free already).
 * @returns true when at least one code point requires grapheme segmentation.
 */
function needsSegmentation(str: string): boolean {
  for (const char of str) {
    const cp = char.codePointAt(0)!
    // Emoji ranges
    if (cp >= 0x1f300 && cp <= 0x1faff) return true
    if (cp >= 0x2600 && cp <= 0x27bf) return true
    // Regional indicator symbols (flag halves)
    if (cp >= 0x1f1e6 && cp <= 0x1f1ff) return true
    // Variation selectors, ZWJ
    if (cp >= 0xfe00 && cp <= 0xfe0f) return true
    if (cp === 0x200d) return true
  }
  return false
}
|
|
|
|
/**
 * Width in terminal columns of a grapheme cluster that matched the emoji regex.
 *
 * Special cases handled before the default of 2:
 *  - a lone regional indicator is 1 column, two or more are 2;
 *  - an incomplete keycap (digit, `#` or `*` followed by VS16 but without
 *    U+20E3) is 1 column.
 *
 * @param grapheme - A single grapheme cluster (non-empty).
 * @returns 1 or 2.
 */
function getEmojiWidth(grapheme: string): number {
  // Regional indicators: single = 1, pair = 2
  const first = grapheme.codePointAt(0)!
  if (first >= 0x1f1e6 && first <= 0x1f1ff) {
    // Count code points, not UTF-16 units (each indicator is a surrogate pair)
    let count = 0
    for (const _ of grapheme) count++
    return count === 1 ? 1 : 2
  }

  // Incomplete keycap: digit/symbol + VS16 without U+20E3
  if (grapheme.length === 2) {
    const second = grapheme.codePointAt(1)
    if (
      second === 0xfe0f &&
      ((first >= 0x30 && first <= 0x39) || first === 0x23 || first === 0x2a)
    ) {
      return 1
    }
  }

  return 2
}
|
|
|
|
/**
 * Report whether a code point takes no terminal columns.
 *
 * The checks are ordered so that the most common printable ranges exit first.
 * Everything not matched by one of the explicit ranges below is treated as
 * visible (returns false).
 *
 * @param codePoint - Unicode code point to classify.
 * @returns true when the code point should contribute 0 to the string width.
 */
function isZeroWidth(codePoint: number): boolean {
  // Fast path for common printable range
  if (codePoint >= 0x20 && codePoint < 0x7f) return false
  // U+00A0..U+02FF: everything is visible except the soft hyphen (U+00AD)
  if (codePoint >= 0xa0 && codePoint < 0x0300) return codePoint === 0x00ad

  // Control characters
  if (codePoint <= 0x1f || (codePoint >= 0x7f && codePoint <= 0x9f)) return true

  // Zero-width and invisible characters
  if (
    (codePoint >= 0x200b && codePoint <= 0x200d) || // ZW space/joiner
    codePoint === 0xfeff || // BOM
    (codePoint >= 0x2060 && codePoint <= 0x2064) // Word joiner etc.
  ) {
    return true
  }

  // Variation selectors
  if (
    (codePoint >= 0xfe00 && codePoint <= 0xfe0f) ||
    (codePoint >= 0xe0100 && codePoint <= 0xe01ef)
  ) {
    return true
  }

  // Combining diacritical marks
  if (
    (codePoint >= 0x0300 && codePoint <= 0x036f) ||
    (codePoint >= 0x1ab0 && codePoint <= 0x1aff) ||
    (codePoint >= 0x1dc0 && codePoint <= 0x1dff) ||
    (codePoint >= 0x20d0 && codePoint <= 0x20ff) ||
    (codePoint >= 0xfe20 && codePoint <= 0xfe2f)
  ) {
    return true
  }

  // Indic script combining marks (covers Devanagari through Malayalam).
  // The test works on the offset within each 128-code-point slice
  // (codePoint & 0x7f); the range check stops at U+0D4F.
  if (codePoint >= 0x0900 && codePoint <= 0x0d4f) {
    // Signs and vowel marks at start of each script block
    const offset = codePoint & 0x7f
    if (offset <= 0x03) return true // Signs at block start
    if (offset >= 0x3a && offset <= 0x4f) return true // Vowel signs, virama
    if (offset >= 0x51 && offset <= 0x57) return true // Stress signs
    if (offset >= 0x62 && offset <= 0x63) return true // Vowel signs
  }

  // Thai/Lao combining marks
  // Note: U+0E32 (SARA AA), U+0E33 (SARA AM), U+0EB2, U+0EB3 are spacing vowels (width 1), not combining marks
  if (
    codePoint === 0x0e31 || // Thai MAI HAN-AKAT
    (codePoint >= 0x0e34 && codePoint <= 0x0e3a) || // Thai vowel signs (skip U+0E32, U+0E33)
    (codePoint >= 0x0e47 && codePoint <= 0x0e4e) || // Thai vowel signs and marks
    codePoint === 0x0eb1 || // Lao MAI KAN
    (codePoint >= 0x0eb4 && codePoint <= 0x0ebc) || // Lao vowel signs (skip U+0EB2, U+0EB3)
    (codePoint >= 0x0ec8 && codePoint <= 0x0ecd) // Lao tone marks
  ) {
    return true
  }

  // Arabic formatting
  if (
    (codePoint >= 0x0600 && codePoint <= 0x0605) ||
    codePoint === 0x06dd ||
    codePoint === 0x070f ||
    codePoint === 0x08e2
  ) {
    return true
  }

  // Surrogates, tag characters
  if (codePoint >= 0xd800 && codePoint <= 0xdfff) return true
  if (codePoint >= 0xe0000 && codePoint <= 0xe007f) return true

  return false
}
|
|
|
|
// Note: complex-script graphemes like Devanagari क्ष (ka+virama+ZWJ+ssa) render
// as a single ligature glyph but occupy 2 terminal cells (wcwidth sums the base
// consonants). Bun.stringWidth=2 matches terminal cell allocation, which is what
// we need for cursor positioning — the JS fallback's grapheme-cluster width of 1
// would desync Ink's layout from the terminal.
//
// Bun.stringWidth is resolved once at module scope rather than checked on every
// call — typeof guards deopt property access and this is a hot path (~100k calls/frame).
const bunStringWidth =
  typeof Bun !== 'undefined' && typeof Bun.stringWidth === 'function'
    ? Bun.stringWidth
    : null

// Options object allocated once and reused for every Bun.stringWidth call.
const BUN_STRING_WIDTH_OPTS = { ambiguousIsNarrow: true } as const

/**
 * Display width of `str` in terminal columns.
 *
 * Bound once at module load: delegates to Bun.stringWidth (with
 * ambiguous-width characters treated as narrow) when it exists, otherwise to
 * the pure-JavaScript fallback stringWidthJavaScript.
 */
export const stringWidth: (str: string) => number = bunStringWidth
  ? str => bunStringWidth(str, BUN_STRING_WIDTH_OPTS)
  : stringWidthJavaScript
|