diff --git a/scripts/unicode.py b/scripts/unicode.py index 4879e82..9f757eb 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -545,6 +545,12 @@ def load_zero_widths() -> list[bool]: lambda cp: operator.setitem(zw_map, cp, True), ) + # HALFWIDTH KATAKANA VOICED SOUND MARK and HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK + # are `Lm` letters even though they currently carry `Grapheme_Extend`. + # They occupy their own cell in terminal-style width calculations. + zw_map[0xFF9E] = False + zw_map[0xFF9F] = False + # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo` # as zero-width. This matches the behavior of glibc `wcwidth`. # diff --git a/src/gen/tables.rs b/src/gen/tables.rs index c8d3e62..b46ba39 100644 --- a/src/gen/tables.rs +++ b/src/gen/tables.rs @@ -616,7 +616,7 @@ pub(crate) static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32( 0x55, 0x55, ], [ - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x00, 0x00, 0x54, 0x55, ], @@ -1157,7 +1157,7 @@ pub(crate) static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32( ], #[cfg(feature = "cjk")] [ - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x00, 0x00, 0x54, 0x59, ], @@ -1217,7 +1217,7 @@ pub(crate) static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 71] = [ ([0xC0, 0xA9, 0x00], [0xC0, 0xA9, 0x00]), ([0xB0, 0xD7, 0x00], [0xC6, 0xD7, 0x00]), ([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]), - ([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]), + ([0xA0, 0xFF, 0x00], [0xA0, 0xFF, 0x00]), ([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]), ([0xC0, 0x11, 0x01], [0xC0, 0x11, 
0x01]), ([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]), diff --git a/src/lib.rs b/src/lib.rs index 6d8b38e..40b132a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -98,7 +98,8 @@ //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) //! with the [`Default_Ignorable_Code_Point`] property. //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) -//! with the [`Grapheme_Extend`] property. +//! with the [`Grapheme_Extend`] property, except [`'\u{FF9E}'` HALFWIDTH KATAKANA VOICED SOUND MARK](https://util.unicode.org/UnicodeJsps/character.jsp?a=FF9E) +//! and [`'\u{FF9F}'` HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK](https://util.unicode.org/UnicodeJsps/character.jsp?a=FF9F). //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) //! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`). //! - The following [`Prepended_Concatenation_Mark`]s: diff --git a/tests/tests.rs b/tests/tests.rs index 7ca32a4..0f83c65 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -38,6 +38,11 @@ fn test_emoji() { assert_width!("👩‍🔬", 2, 2); // Woman scientist } +#[test] +fn test_halfwidth_katakana() { + assert_width!("\u{FF8A}\u{FF9F}\u{FF78}\u{FF9E}", 4, 4); // Halfwidth Katakana Pa, Gu (pug dog): HA + semi-voiced mark, KU + voiced mark; escapes keep the literal from being normalized to fullwidth +} + // From README #[test] fn test_bad_devanagari() { @@ -63,6 +68,8 @@ fn test_char2() { assert_width!('\u{1160}', Some(0), Some(0)); assert_width!('\u{a1}', Some(1), Some(2)); assert_width!('\u{300}', Some(0), Some(0)); + assert_width!('\u{FF9E}', Some(1), Some(1)); + assert_width!('\u{FF9F}', Some(1), Some(1)); } #[test]