mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
fix(text): avoid breaking UTF-8 during truncation
Use rune-aware truncation for user-facing text and log previews so multibyte content is not corrupted in memory context, Telegram messages, or diagnostics.
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
package textutil
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// TruncateRunes returns s truncated to at most maxRunes Unicode code points.
|
||||
func TruncateRunes(s string, maxRunes int) string {
|
||||
if maxRunes <= 0 || s == "" {
|
||||
return ""
|
||||
}
|
||||
cut, truncated := byteIndexAfterRunes(s, maxRunes)
|
||||
if !truncated {
|
||||
return s
|
||||
}
|
||||
return s[:cut]
|
||||
}
|
||||
|
||||
// TruncateRunesWithSuffix returns s limited to at most maxRunes runes. When s
// has to be shortened, suffix is appended and counts toward the limit, so the
// kept prefix plus suffix never exceeds maxRunes runes.
//
// Special cases:
//   - maxRunes <= 0 or an empty s yields "".
//   - s is returned unchanged when it already fits within maxRunes runes.
//   - When suffix is empty, or has at least maxRunes runes and therefore
//     leaves no room for any part of s, the result is s cut to maxRunes runes
//     with no suffix appended.
//
// Runes are counted the way a range loop over a string counts them: every
// byte that is not part of a valid UTF-8 sequence counts as one rune. Cuts
// are made only at offsets produced by that loop, so a valid multibyte
// sequence is never split and invalid bytes inside the kept prefix are copied
// through unchanged.
func TruncateRunesWithSuffix(s string, maxRunes int, suffix string) string {
	if maxRunes <= 0 || s == "" {
		return ""
	}
	// A string never has more runes than bytes, so this proves s already fits
	// without decoding it.
	if len(s) <= maxRunes {
		return s
	}

	// keep is how many runes of s survive a truncation. The suffix takes its
	// share of the budget unless it is empty or would leave no room for s.
	keep := maxRunes
	appendSuffix := false
	if suffix != "" {
		if n := utf8.RuneCountInString(suffix); n < maxRunes {
			keep = maxRunes - n
			appendSuffix = true
		}
	}

	// Single pass over s: remember the byte offset where the kept prefix
	// ends, and stop as soon as a rune beyond the limit shows up. Because
	// 1 <= keep <= maxRunes, cut is always set before it is used.
	cut, seen := 0, 0
	for i := range s {
		if seen == keep {
			cut = i
		}
		if seen == maxRunes {
			if appendSuffix {
				return s[:cut] + suffix
			}
			return s[:cut]
		}
		seen++
	}

	// The loop ended without finding a rune past the limit: s fits as is.
	return s
}
|
||||
|
||||
// byteIndexAfterRunes walks s rune by rune and reports the byte offset at
// which the first maxRunes runes end, together with whether anything follows
// that offset.
//
// It returns (offset, true) when s holds more than maxRunes runes, where
// offset is the start of the first rune past the limit, and (len(s), false)
// when s fits entirely. An empty s gives (0, false); a non-positive maxRunes
// with a non-empty s gives (0, true).
func byteIndexAfterRunes(s string, maxRunes int) (int, bool) {
	if s == "" {
		return 0, false
	}
	if maxRunes <= 0 {
		return 0, true
	}

	// Count down the rune budget; the offset at which it hits zero while
	// runes remain is the cut point.
	remaining := maxRunes
	for offset := range s {
		if remaining == 0 {
			return offset, true
		}
		remaining--
	}
	return len(s), false
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package textutil
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func TestTruncateRunes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := "你好世界"
|
||||
got := TruncateRunes(text, 3)
|
||||
if got != "你好世" {
|
||||
t.Fatalf("TruncateRunes() = %q, want %q", got, "你好世")
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Fatalf("TruncateRunes() returned invalid UTF-8: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateRunesWithSuffix(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := strings.Repeat("你", 10) + "abc"
|
||||
got := TruncateRunesWithSuffix(text, 8, "...")
|
||||
if utf8.RuneCountInString(got) != 8 {
|
||||
t.Fatalf("TruncateRunesWithSuffix() rune count = %d, want 8", utf8.RuneCountInString(got))
|
||||
}
|
||||
if got != strings.Repeat("你", 5)+"..." {
|
||||
t.Fatalf("TruncateRunesWithSuffix() = %q", got)
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Fatalf("TruncateRunesWithSuffix() returned invalid UTF-8: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateRunesWithSuffixNoTruncation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := "你好世界"
|
||||
if got := TruncateRunesWithSuffix(text, 8, "..."); got != text {
|
||||
t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateRunesWithSuffixKeepsInvalidUTF8Bytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := "ab\xffcd"
|
||||
got := TruncateRunesWithSuffix(text, 4, "...")
|
||||
if got != "a..." {
|
||||
t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, "a...")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user