diff --git a/twinkle-screen/src/main/java/org/codejive/twinkle/screen/io/BufferWriter.java b/twinkle-screen/src/main/java/org/codejive/twinkle/screen/io/BufferWriter.java index cc7cc03..35daacb 100644 --- a/twinkle-screen/src/main/java/org/codejive/twinkle/screen/io/BufferWriter.java +++ b/twinkle-screen/src/main/java/org/codejive/twinkle/screen/io/BufferWriter.java @@ -7,13 +7,13 @@ import org.codejive.twinkle.ansi.util.AnsiOutputParser.AnsiSequenceHandler; import org.codejive.twinkle.screen.Buffer; import org.codejive.twinkle.screen.Buffer.LinkPrintOption; -import org.codejive.twinkle.text.SequenceDecoder; import org.codejive.twinkle.text.Size; +import org.codejive.twinkle.text.UnicodeDecoder; import org.jspecify.annotations.NonNull; public class BufferWriter extends Writer { protected Buffer buffer; - protected SequenceDecoder decoder; + protected UnicodeDecoder decoder; int cursorX; int cursorY; private int savedCursorX; @@ -26,7 +26,7 @@ public class BufferWriter extends Writer { public BufferWriter(@NonNull Buffer buffer) { this.buffer = buffer; - this.decoder = new SequenceDecoder(); + this.decoder = new UnicodeDecoder(); this.cursorX = 0; this.cursorY = 0; this.savedCursorX = 0; @@ -73,7 +73,7 @@ public void flush() { } decoder.finish(); if (decoder.isReady()) { - if (decoder.state() == SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE) { + if (decoder.state() == UnicodeDecoder.ANSI) { handleAnsiSequence(decoder.toString()); } else if (decoder.codepoint() == '\n') { cursorX = 0; diff --git a/twinkle-text/src/main/java/org/codejive/twinkle/ansi/util/AnsiDecoder.java b/twinkle-text/src/main/java/org/codejive/twinkle/ansi/util/AnsiDecoder.java new file mode 100644 index 0000000..d0f0925 --- /dev/null +++ b/twinkle-text/src/main/java/org/codejive/twinkle/ansi/util/AnsiDecoder.java @@ -0,0 +1,246 @@ +package org.codejive.twinkle.ansi.util; + +import org.codejive.twinkle.ansi.Constants; + +/** + * A base decoder for handling ANSI escape sequences. 
This class provides the foundation for + * decoding character sequences, with a focus on identifying and parsing ANSI escape sequences. + * Subclasses can extend this to add additional sequence handling. + * + *
<p>
Characters are pushed into the decoder while its state is INCOMPLETE. Once enough information + * is available to determine the sequence type, the state changes accordingly. Use {@code reset()} + * to clear the decoder and start a new sequence. + */ +public class AnsiDecoder { + // State constants + public static final int INCOMPLETE = 0; + public static final int ANSI = 1; + public static final int ERROR = 2; + + protected enum AnsiMode { + NONE, + PREFIX, + CSI, + OSC + } + + protected final StringBuilder buffer = new StringBuilder(); + protected int state = INCOMPLETE; + protected AnsiMode ansiMode = AnsiMode.NONE; + protected boolean oscSeenEsc = false; + + /** + * Pushes a character value (as an int) into the decoder. + * + *
<p>
Accepts int values to support full Unicode range including supplementary characters. This + * base implementation handles ANSI escape sequences. Subclasses should override {@link + * #handleNonAnsi(int)} to provide additional handling for non-ANSI characters. + * + * @param c the character value to push + */ + public void push(int c) { + if (!canPush(c)) { + state = ERROR; + return; + } + + if (Character.isSupplementaryCodePoint(c)) { + buffer.append(Character.toChars(c)); + } else { + buffer.append((char) c); + } + + if (ansiMode != AnsiMode.NONE) { + char[] chars = Character.toChars(c); + for (int i = 0; i < chars.length; i++) { + pushAnsi(chars[i]); + if (state == ERROR || state == ANSI) { + break; + } + } + return; + } + + if (c == Constants.ESC) { + pushAnsi((char) c); + return; + } + + handleNonAnsi(c); + } + + /** + * Returns true if the given character value can be consumed as part of the currently decoded + * sequence. + * + *
<p>
This is a non-mutating probe. Callers can use it to detect sequence boundaries without + * relying on completion heuristics. + * + * @param c the character value to check + * @return true if the character can be pushed + */ + public boolean canPush(int c) { + if (state == ERROR || state == ANSI) { + return false; + } + + if (!Character.isValidCodePoint(c)) { + return false; + } + + if (ansiMode != AnsiMode.NONE) { + return true; + } + + if (buffer.length() == 0) { + return true; + } + + if (c == Constants.ESC) { + return false; + } + + return canPushNonAnsi(c); + } + + /** + * Finalizes pending state when no more input is available. + * + *
<p>
This base implementation resolves unterminated ANSI escapes as ANSI sequences. Subclasses + * should override {@link #finishNonAnsi()} to handle additional finalization logic. + */ + public void finish() { + if (state == ERROR) { + return; + } + if (ansiMode != AnsiMode.NONE) { + ansiMode = AnsiMode.NONE; + state = ANSI; + return; + } + finishNonAnsi(); + } + + /** Resets the decoder to its initial state, clearing all accumulated data. */ + public void reset() { + buffer.setLength(0); + state = INCOMPLETE; + ansiMode = AnsiMode.NONE; + oscSeenEsc = false; + resetNonAnsi(); + } + + /** Returns true if the decoder has completed a sequence. */ + public boolean isComplete() { + return state() != INCOMPLETE; + } + + /** + * Returns the current state of the decoder. + * + * @return the current state as an int constant + */ + public int state() { + return state; + } + + /** Returns the buffered sequence as a string. */ + @Override + public String toString() { + return buffer.toString(); + } + + /** Handles ANSI escape sequence parsing logic. 
*/ + protected void pushAnsi(char ch) { + if (state == ANSI) { + state = ERROR; + return; + } + + if (ansiMode == AnsiMode.NONE) { + if (ch == Constants.ESC) { + ansiMode = AnsiMode.PREFIX; + state = INCOMPLETE; + return; + } + state = ERROR; + return; + } + + if (ansiMode == AnsiMode.PREFIX) { + if (ch == '[') { + ansiMode = AnsiMode.CSI; + state = INCOMPLETE; + } else if (ch == ']') { + ansiMode = AnsiMode.OSC; + state = INCOMPLETE; + oscSeenEsc = false; + } else { + state = ANSI; + ansiMode = AnsiMode.NONE; + } + return; + } + + if (ansiMode == AnsiMode.CSI) { + if (ch >= 0x40 && ch <= 0x7E) { + state = ANSI; + ansiMode = AnsiMode.NONE; + } else { + state = INCOMPLETE; + } + return; + } + + if (ansiMode == AnsiMode.OSC) { + if (oscSeenEsc) { + if (ch == '\\') { + state = ANSI; + ansiMode = AnsiMode.NONE; + oscSeenEsc = false; + return; + } + oscSeenEsc = (ch == Constants.ESC); + state = INCOMPLETE; + return; + } + if (ch == 0x07) { + state = ANSI; + ansiMode = AnsiMode.NONE; + return; + } + oscSeenEsc = (ch == Constants.ESC); + state = INCOMPLETE; + } + } + + /** + * Hook for subclasses to handle non-ANSI characters. Base implementation sets state to ERROR. + * + * @param c the character value to handle + */ + protected void handleNonAnsi(int c) { + state = ERROR; + } + + /** + * Hook for subclasses to check if a non-ANSI character can be pushed. Base implementation + * returns false. + * + * @param c the character value to check + * @return true if the character can be pushed + */ + protected boolean canPushNonAnsi(int c) { + return false; + } + + /** Hook for subclasses to perform finalization of non-ANSI sequences. */ + protected void finishNonAnsi() { + // Base implementation does nothing + } + + /** Hook for subclasses to reset non-ANSI state. 
*/ + protected void resetNonAnsi() { + // Base implementation does nothing + } +} diff --git a/twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceIterator.java b/twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceIterator.java index 6c495c8..6b4f103 100644 --- a/twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceIterator.java +++ b/twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceIterator.java @@ -39,12 +39,6 @@ public interface SequenceIterator { /** Returns the full sequence of the last returned codepoint from {@link #next()}. */ String sequence(); - /** Returns the start index of the current sequence in characters. */ - int begin(); - - /** Returns the end index of the current sequence in characters. */ - int end(); - static SequenceIterator of(CharSequence text) { return new CharSequenceSequenceIterator(text); } @@ -104,7 +98,7 @@ protected int calculateWidth(int cp) { class CharSequenceSequenceIterator extends BaseSequenceIterator { private final CharSequence text; private final int length; - private final SequenceDecoder decoder = new SequenceDecoder(); + private final UnicodeDecoder decoder = new UnicodeDecoder(); private int cursor = 0; private int sequenceStart = 0; @@ -144,16 +138,6 @@ public String sequence() { return text.subSequence(sequenceStart, sequenceEnd).toString(); } - @Override - public int begin() { - return sequenceStart; - } - - @Override - public int end() { - return sequenceEnd; - } - private void primeNext() { if (cursor >= length) { nextLeadCodePoint = -1; @@ -184,12 +168,12 @@ private void primeNext() { cursor += cpChars; } - if (cursor >= length || decoder.state() == SequenceDecoder.State.INCOMPLETE) { + if (cursor >= length || decoder.state() == UnicodeDecoder.INCOMPLETE) { decoder.finish(); } sequenceEnd = cursor; - if (decoder.state() == SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE) { + if (decoder.state() == UnicodeDecoder.ANSI) { nextLeadCodePoint = Constants.ESC; currentWidth = 0; } else { 
@@ -212,12 +196,10 @@ private void primeNext() { class ReaderSequenceIterator extends BaseSequenceIterator { private final PushbackReader reader; private final StringBuilder currentSequence = new StringBuilder(); - private final SequenceDecoder decoder = new SequenceDecoder(); + private final UnicodeDecoder decoder = new UnicodeDecoder(); private int nextLeadCodePoint = -1; private boolean primed = false; private boolean exhausted = false; - private int position = 0; - private int sequenceStart = 0; /** Creates a SequenceIterator that reads from the given Reader. */ ReaderSequenceIterator(Reader reader) { @@ -251,21 +233,8 @@ public String sequence() { return currentSequence.toString(); } - /** Returns the start index of the current sequence in characters. */ - @Override - public int begin() { - return sequenceStart; - } - - /** Returns the end index of the current sequence in characters. */ - @Override - public int end() { - return sequenceStart + currentSequence.length(); - } - private void primeNext() { currentSequence.setLength(0); - sequenceStart = position; nextLeadCodePoint = -1; try { @@ -299,12 +268,12 @@ private void primeNext() { } if (currentSequence.length() > 0 - && (cp == -1 || decoder.state() == SequenceDecoder.State.INCOMPLETE)) { + && (cp == -1 || decoder.state() == UnicodeDecoder.INCOMPLETE)) { decoder.finish(); } if (currentSequence.length() > 0 && nextLeadCodePoint == -1) { - if (decoder.state() == SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE) { + if (decoder.state() == UnicodeDecoder.ANSI) { nextLeadCodePoint = Constants.ESC; currentWidth = 0; } else { @@ -340,15 +309,10 @@ private void unreadCodePoint(int cp) throws IOException { } private int read() throws IOException { - int c = reader.read(); - if (c != -1) { - position++; - } - return c; + return reader.read(); } private void unread(int c) throws IOException { reader.unread(c); - position--; } } diff --git a/twinkle-text/src/main/java/org/codejive/twinkle/text/StyledIterator.java 
b/twinkle-text/src/main/java/org/codejive/twinkle/text/StyledIterator.java index a69f333..d5d3198 100644 --- a/twinkle-text/src/main/java/org/codejive/twinkle/text/StyledIterator.java +++ b/twinkle-text/src/main/java/org/codejive/twinkle/text/StyledIterator.java @@ -71,16 +71,6 @@ public int width() { return delegate.width(); } - @Override - public int begin() { - return delegate.begin(); - } - - @Override - public int end() { - return delegate.end(); - } - @Override public String sequence() { return delegate.sequence(); diff --git a/twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceDecoder.java b/twinkle-text/src/main/java/org/codejive/twinkle/text/UnicodeDecoder.java similarity index 54% rename from twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceDecoder.java rename to twinkle-text/src/main/java/org/codejive/twinkle/text/UnicodeDecoder.java index 34851dc..c26fcdf 100644 --- a/twinkle-text/src/main/java/org/codejive/twinkle/text/SequenceDecoder.java +++ b/twinkle-text/src/main/java/org/codejive/twinkle/text/UnicodeDecoder.java @@ -1,40 +1,28 @@ package org.codejive.twinkle.text; import org.codejive.twinkle.ansi.Constants; +import org.codejive.twinkle.ansi.util.AnsiDecoder; /** - * A utility class for decoding sequences of characters. Sequences can be built up by pushing - * characters into the decoder while it's state is INCOMPLETE. Once the decoder has enough - * information to determine the type of sequence (e.g. a codepoint, a grapheme cluster, or an ANSI - * escape sequence), the state will change to the corresponding type. The decoder can then be - * queried for the result. reset() can be used to clear the decoder and start building - * a new sequence. + * A decoder for character sequences including Unicode codepoints, grapheme clusters, and ANSI + * escape sequences. Extends {@link AnsiDecoder} to add support for proper Unicode handling + * including surrogate pairs, grapheme cluster boundaries, and extended grapheme cluster rules. 
+ * + *
<p>
Sequences can be built up by pushing characters into the decoder while its state is + * INCOMPLETE. Once the decoder has enough information to determine the type of sequence (e.g. a + * codepoint, a grapheme cluster, or an ANSI escape sequence), the state will change to the + * corresponding type. The decoder can then be queried for the result. reset() can be + * used to clear the decoder and start building a new sequence. */ -public class SequenceDecoder { - public enum State { - INCOMPLETE, - CODEPOINT, - GRAPHEME_CLUSTER, - ANSI_ESCAPE_SEQUENCE, - ERROR - } - - private enum AnsiMode { - NONE, - PREFIX, - CSI, - OSC - } +public class UnicodeDecoder extends AnsiDecoder { + // Additional state constants for Unicode handling + public static final int CODEPOINT = 10; + public static final int GRAPHEME_CLUSTER = 11; private static final int NEWLINE = '\n'; private static final int CARRIAGE_RETURN = '\r'; - private final StringBuilder buffer = new StringBuilder(); - - private State state = State.INCOMPLETE; - private AnsiMode ansiMode = AnsiMode.NONE; private char pendingHighSurrogate = 0; - private boolean oscSeenEsc = false; private int riCount = 0; private boolean pendingCarriageReturn = false; private int firstCodepoint = -1; @@ -44,196 +32,119 @@ private enum AnsiMode { /** * Pushes either a Unicode code point or a UTF-16 code unit encoded as an int. * - *
<p>
Values in the surrogate range are treated as UTF-16 code units and paired using internal - * pending-surrogate state. + *
<p>
Extends the base implementation to handle UTF-16 surrogate pairs and Unicode codepoint + * sequences properly. */ + @Override public void push(int cp) { + // Handle UTF-16 surrogate pairs if (pendingHighSurrogate != 0) { if (!canPush(cp)) { - state = State.ERROR; + state = ERROR; return; } char low = (char) cp; buffer.append(low); int codepoint = Character.toCodePoint(pendingHighSurrogate, low); pendingHighSurrogate = 0; - pushCodepoint(codepoint); + handleNonAnsi(codepoint); return; } if (cp >= Character.MIN_HIGH_SURROGATE && cp <= Character.MAX_HIGH_SURROGATE) { if (!canPush(cp)) { - state = State.ERROR; + state = ERROR; return; } buffer.append((char) cp); pendingHighSurrogate = (char) cp; - state = State.INCOMPLETE; - return; - } - - if (!canPush(cp)) { - state = State.ERROR; - return; - } - - if (Character.isSupplementaryCodePoint(cp)) { - buffer.append(Character.toChars(cp)); - } else { - buffer.append((char) cp); - } - - if (pendingCarriageReturn) { - pendingCarriageReturn = false; - state = State.CODEPOINT; - return; - } - - if (ansiMode != AnsiMode.NONE) { - char[] chars = Character.toChars(cp); - for (int i = 0; i < chars.length; i++) { - pushAnsi(chars[i]); - if (state == State.ERROR || state == State.ANSI_ESCAPE_SEQUENCE) { - break; - } - } - return; - } - - if (cp == Constants.ESC) { - pushAnsi((char) cp); + state = INCOMPLETE; return; } - if (cp == CARRIAGE_RETURN) { - firstCodepoint = NEWLINE; - lastCodepoint = NEWLINE; - codepointCount = 1; - state = State.INCOMPLETE; - pendingCarriageReturn = true; - return; - } - - if (cp == NEWLINE) { - firstCodepoint = NEWLINE; - lastCodepoint = NEWLINE; - codepointCount = 1; - state = State.CODEPOINT; - return; - } - - pushCodepoint(cp); + // Delegate to base class for ANSI handling and standard processing + super.push(cp); } /** * Returns true if {@code cp} can be consumed as part of the currently decoded sequence. * - *
<p>
This is a non-mutating probe. Callers can use it to detect sequence boundaries without - * relying on completion heuristics. - * - *
<p>
Like {@link #push(int)}, this accepts either Unicode code points or UTF-16 code units - * encoded as ints. + *
<p>
Extends the base implementation to handle UTF-16 surrogate pairs and grapheme cluster + * boundaries. */ + @Override public boolean canPush(int cp) { - if (state == State.ERROR || state == State.ANSI_ESCAPE_SEQUENCE) { + if (state == ERROR || state == ANSI) { return false; } + // Handle UTF-16 surrogate pairs if (pendingHighSurrogate != 0) { if (cp < Character.MIN_LOW_SURROGATE || cp > Character.MAX_LOW_SURROGATE) { return false; } int codepoint = Character.toCodePoint(pendingHighSurrogate, (char) cp); - return canPushCodepoint(codepoint); + return canPushNonAnsi(codepoint); } if (cp >= Character.MIN_HIGH_SURROGATE && cp <= Character.MAX_HIGH_SURROGATE) { if (ansiMode != AnsiMode.NONE || buffer.length() == 0) { return true; } - return canPushCodepoint(0x10000); + return canPushNonAnsi(0x10000); } if (cp >= Character.MIN_LOW_SURROGATE && cp <= Character.MAX_LOW_SURROGATE) { return false; } - if (!Character.isValidCodePoint(cp)) { - return false; - } - - if (pendingCarriageReturn) { - return cp == NEWLINE; - } - - if (ansiMode != AnsiMode.NONE) { - return true; - } - - if (buffer.length() == 0) { - return true; - } - - if (cp == Constants.ESC) { - return false; - } - - return canPushCodepoint(cp); + // Delegate to base class + return super.canPush(cp); } /** * Finalizes pending state when no more input is available. * - *
<p>
This resolves incomplete CR line endings as newline sequences and resolves unterminated - * ANSI escapes as ANSI sequences, matching iterator semantics at end of input. + *
<p>
Extends the base implementation to resolve incomplete CR line endings and validate + * surrogate pairs. */ + @Override public void finish() { - if (state == State.ERROR) { + if (state == ERROR) { return; } if (pendingHighSurrogate != 0) { - state = State.ERROR; + state = ERROR; return; } if (pendingCarriageReturn) { pendingCarriageReturn = false; - state = State.CODEPOINT; + state = CODEPOINT; return; } - if (ansiMode != AnsiMode.NONE) { - ansiMode = AnsiMode.NONE; - state = State.ANSI_ESCAPE_SEQUENCE; - } + // Delegate to base class for ANSI finalization + super.finish(); } + /** Resets the decoder to its initial state, clearing all accumulated data. */ + @Override public void reset() { - buffer.setLength(0); - state = State.INCOMPLETE; - ansiMode = AnsiMode.NONE; + super.reset(); pendingHighSurrogate = 0; pendingCarriageReturn = false; - oscSeenEsc = false; - riCount = 0; - firstCodepoint = -1; - lastCodepoint = -1; - codepointCount = 0; - } - - public boolean isComplete() { - return state() != State.INCOMPLETE; } /** - * Returns true when the current sequence can be emitted as-is. + * Returns true if the decoder has completed a sequence and can be emitted as-is. * *
<p>
Unlike {@link #isComplete()}, this reports false for tails that are syntactically * extendable and usually require continuation (for example trailing ZWJ/virama/prepend). */ public boolean isReady() { - if (state == State.ERROR || state == State.INCOMPLETE) { + if (state == ERROR || state == INCOMPLETE) { return false; } - if (state == State.ANSI_ESCAPE_SEQUENCE) { + if (state == ANSI) { return true; } if (pendingHighSurrogate != 0 || pendingCarriageReturn || codepointCount == 0) { @@ -245,10 +156,6 @@ public boolean isReady() { || Unicode.isPrepend(lastCodepoint)); } - public State state() { - return state; - } - /** * Returns the lead code point for the decoded sequence. * @@ -256,10 +163,10 @@ public State state() { * "\r"} or {@code "\r\n"}. Returns -1 while incomplete or in error. */ public int codepoint() { - if (!isComplete() || state == State.ERROR) { + if (!isComplete() || state == ERROR) { return -1; } - if (state == State.ANSI_ESCAPE_SEQUENCE) { + if (state == ANSI) { return Constants.ESC; } return codepointCount == 0 ? -1 : firstCodepoint; @@ -274,86 +181,70 @@ public int codepoint() { * @return the visual column width, or -1 while incomplete or in error state */ public int width() { - if (!isComplete() || state == State.ERROR) { + if (!isComplete() || state == ERROR) { return -1; } - if (state == State.ANSI_ESCAPE_SEQUENCE) { + if (state == ANSI) { return 0; } return codepointCount == 0 ? -1 : calculateWidth(firstCodepoint); } + /** + * Implements the hook for handling non-ANSI characters. This method handles Unicode codepoints + * including newlines and grapheme cluster formation. 
+ */ @Override - public String toString() { - return buffer.toString(); - } - - private void pushAnsi(char ch) { - if (state == State.ANSI_ESCAPE_SEQUENCE) { - state = State.ERROR; + protected void handleNonAnsi(int cp) { + if (pendingCarriageReturn) { + pendingCarriageReturn = false; + state = CODEPOINT; return; } - if (ansiMode == AnsiMode.NONE) { - if (ch == Constants.ESC) { - ansiMode = AnsiMode.PREFIX; - state = State.INCOMPLETE; - return; - } - state = State.ERROR; + if (cp == CARRIAGE_RETURN) { + firstCodepoint = NEWLINE; + lastCodepoint = NEWLINE; + codepointCount = 1; + state = INCOMPLETE; + pendingCarriageReturn = true; return; } - if (ansiMode == AnsiMode.PREFIX) { - if (ch == '[') { - ansiMode = AnsiMode.CSI; - state = State.INCOMPLETE; - } else if (ch == ']') { - ansiMode = AnsiMode.OSC; - state = State.INCOMPLETE; - oscSeenEsc = false; - } else { - state = State.ANSI_ESCAPE_SEQUENCE; - ansiMode = AnsiMode.NONE; - } + if (cp == NEWLINE) { + firstCodepoint = NEWLINE; + lastCodepoint = NEWLINE; + codepointCount = 1; + state = CODEPOINT; return; } - if (ansiMode == AnsiMode.CSI) { - if (ch >= 0x40 && ch <= 0x7E) { - state = State.ANSI_ESCAPE_SEQUENCE; - ansiMode = AnsiMode.NONE; - } else { - state = State.INCOMPLETE; - } - return; - } + pushCodepoint(cp); + } - if (ansiMode == AnsiMode.OSC) { - if (oscSeenEsc) { - if (ch == '\\') { - state = State.ANSI_ESCAPE_SEQUENCE; - ansiMode = AnsiMode.NONE; - oscSeenEsc = false; - return; - } - oscSeenEsc = (ch == Constants.ESC); - state = State.INCOMPLETE; - return; - } - if (ch == 0x07) { - state = State.ANSI_ESCAPE_SEQUENCE; - ansiMode = AnsiMode.NONE; - return; - } - oscSeenEsc = (ch == Constants.ESC); - state = State.INCOMPLETE; + /** Implements the hook for checking if a non-ANSI character can be pushed. 
*/ + @Override + protected boolean canPushNonAnsi(int cp) { + if (pendingCarriageReturn) { + return cp == NEWLINE; } + return canPushCodepoint(cp); + } + + /** Implements the hook for resetting Unicode-specific state. */ + @Override + protected void resetNonAnsi() { + pendingHighSurrogate = 0; + pendingCarriageReturn = false; + riCount = 0; + firstCodepoint = -1; + lastCodepoint = -1; + codepointCount = 0; } private void pushCodepoint(int cp) { - if (state == State.ANSI_ESCAPE_SEQUENCE) { - state = State.ERROR; + if (state == ANSI) { + state = ERROR; return; } @@ -362,19 +253,19 @@ private void pushCodepoint(int cp) { lastCodepoint = cp; codepointCount = 1; riCount = Unicode.isRegionalIndicator(cp) ? 1 : 0; - state = State.CODEPOINT; + state = CODEPOINT; return; } if (shouldBreak(lastCodepoint, cp, riCount)) { - state = State.ERROR; + state = ERROR; return; } lastCodepoint = cp; codepointCount++; riCount = Unicode.isRegionalIndicator(cp) ? riCount + 1 : 0; - state = State.GRAPHEME_CLUSTER; + state = GRAPHEME_CLUSTER; } private boolean canPushCodepoint(int cp) { diff --git a/twinkle-text/src/test/java/org/codejive/twinkle/text/TestSequenceDecoder.java b/twinkle-text/src/test/java/org/codejive/twinkle/text/TestSequenceDecoder.java index aee184c..bdb3792 100644 --- a/twinkle-text/src/test/java/org/codejive/twinkle/text/TestSequenceDecoder.java +++ b/twinkle-text/src/test/java/org/codejive/twinkle/text/TestSequenceDecoder.java @@ -8,109 +8,109 @@ public class TestSequenceDecoder { @Test public void testSimpleCodepoint() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); assertThat(decoder.isComplete()).isFalse(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.push('A'); assertThat(decoder.isComplete()).isTrue(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + 
assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.width()).isEqualTo(1); assertThat(decoder.toString()).isEqualTo("A"); } @Test public void testSurrogatePair() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); String clef = "\uD834\uDD1E"; decoder.push(clef.charAt(0)); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.push(clef.charAt(1)); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.toString()).isEqualTo(clef); } @Test public void testCombiningMarkUpgradesToGraphemeCluster() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('a'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.canPush('\u0301')).isTrue(); assertThat(decoder.canPush('b')).isFalse(); decoder.push('\u0301'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.width()).isEqualTo(1); assertThat(decoder.toString()).isEqualTo("a\u0301"); } @Test public void testAnsiCsiSequence() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); String csi = "\u001B[31m"; for (int i = 0; i < csi.length() - 1; i++) { decoder.push(csi.charAt(i)); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); } decoder.push(csi.charAt(csi.length() - 1)); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ANSI); 
assertThat(decoder.width()).isEqualTo(0); assertThat(decoder.toString()).isEqualTo(csi); } @Test public void testAnsiOscSequenceWithStTerminator() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); String osc = "\u001B]8;;http://example.com\u001B\\"; for (int i = 0; i < osc.length() - 1; i++) { decoder.push(osc.charAt(i)); } - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.push(osc.charAt(osc.length() - 1)); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ANSI); assertThat(decoder.toString()).isEqualTo(osc); } @Test public void testInvalidSurrogateGoesToError() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('\uD834'); decoder.push('x'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ERROR); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ERROR); assertThat(decoder.width()).isEqualTo(-1); } @Test public void testResetClearsState() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('A'); assertThat(decoder.isComplete()).isTrue(); decoder.reset(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); assertThat(decoder.width()).isEqualTo(-1); assertThat(decoder.toString()).isEmpty(); } @Test public void testCanPushDetectsBoundaryCharacters() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); assertThat(decoder.canPush('x')).isTrue(); decoder.push('x'); @@ -121,12 +121,12 @@ public void testCanPushDetectsBoundaryCharacters() { assertThat(decoder.canPush('\u001B')).isFalse(); // Probe does not mutate state. 
- assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); } @Test public void testCanPushDuringAnsiAndAfterCompletion() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); assertThat(decoder.canPush('\u001B')).isTrue(); decoder.push('\u001B'); @@ -139,13 +139,13 @@ public void testCanPushDuringAnsiAndAfterCompletion() { assertThat(decoder.canPush('m')).isTrue(); decoder.push('m'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ANSI); assertThat(decoder.canPush('x')).isFalse(); } @Test public void testFamilySequenceReadinessAndCanPushBoundaries() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); String man = "\uD83D\uDC68"; String woman = "\uD83D\uDC69"; @@ -181,27 +181,27 @@ public void testFamilySequenceReadinessAndCanPushBoundaries() { pushStringAssertingCanPush(decoder, boy); assertThat(decoder.isReady()).isTrue(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.toString()) .isEqualTo(man + joiner + woman + joiner + girl + joiner + boy); } @Test public void testPushCodepointOverloadWithSupplementary() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); int man = 0x1F468; assertThat(decoder.canPush(man)).isTrue(); decoder.push(man); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.toString()).isEqualTo(new String(Character.toChars(man))); assertThat(decoder.isReady()).isTrue(); } @Test public void testCanPushCodepointOverloadForFamilyJoin() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder 
decoder = new UnicodeDecoder(); decoder.push(0x1F468); assertThat(decoder.canPush(0x1F469)).isFalse(); @@ -212,12 +212,12 @@ public void testCanPushCodepointOverloadForFamilyJoin() { assertThat(decoder.canPush(0x1F469)).isTrue(); decoder.push(0x1F469); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); } @Test public void testInvalidCodepointOverloadInput() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); assertThat(decoder.canPush(-1)).isFalse(); assertThat(decoder.canPush(0x110000)).isFalse(); @@ -225,10 +225,10 @@ public void testInvalidCodepointOverloadInput() { assertThat(decoder.canPush(0xDC00)).isFalse(); decoder.push(0x110000); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ERROR); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ERROR); } - private static void pushStringAssertingCanPush(SequenceDecoder decoder, String value) { + private static void pushStringAssertingCanPush(UnicodeDecoder decoder, String value) { for (int i = 0; i < value.length(); i++) { char ch = value.charAt(i); assertThat(decoder.canPush(ch)).isTrue(); @@ -238,15 +238,15 @@ private static void pushStringAssertingCanPush(SequenceDecoder decoder, String v @Test public void testFinishLoneCrAtEofYieldsNewline() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('\r'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); assertThat(decoder.isReady()).isFalse(); decoder.finish(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.toString()).isEqualTo("\r"); assertThat(decoder.codepoint()).isEqualTo('\n'); assertThat(decoder.width()).isEqualTo(0); @@ 
-254,104 +254,104 @@ public void testFinishLoneCrAtEofYieldsNewline() { @Test public void testFinishAfterCrLfHasNoEffect() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); // CR+LF is already complete — finish() should leave state unchanged. decoder.push('\r'); decoder.push('\n'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); decoder.finish(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.toString()).isEqualTo("\r\n"); assertThat(decoder.codepoint()).isEqualTo('\n'); } @Test public void testFinishUnterminatedCsiYieldsAnsiSequence() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); // Push ESC [ 3 1 — missing final byte 'm'. decoder.push('\u001B'); decoder.push('['); decoder.push('3'); decoder.push('1'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.finish(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ANSI); assertThat(decoder.toString()).isEqualTo("\u001B[31"); assertThat(decoder.width()).isEqualTo(0); } @Test public void testFinishUnterminatedOscYieldsAnsiSequence() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); // Push ESC ] 0 ; T i t l e — no BEL or ST terminator. 
String osc = "\u001B]0;Title"; for (int i = 0; i < osc.length(); i++) { decoder.push(osc.charAt(i)); } - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.finish(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ANSI_ESCAPE_SEQUENCE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ANSI); assertThat(decoder.toString()).isEqualTo(osc); assertThat(decoder.width()).isEqualTo(0); } @Test public void testFinishPendingHighSurrogateYieldsError() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('\uD834'); // high surrogate, low surrogate never arrives - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.finish(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ERROR); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ERROR); } @Test public void testFinishOnCompleteStateIsNoOp() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('A'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); decoder.finish(); // Already complete — finish() must not change state or codepoints. 
- assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.toString()).isEqualTo("A"); } @Test public void testFinishOnErrorStateIsNoOp() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push(0x110000); // invalid code point → ERROR - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ERROR); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ERROR); decoder.finish(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.ERROR); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.ERROR); } @Test public void testFinishOnEmptyDecoderIsNoOp() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); decoder.finish(); // Nothing was pushed — finish() on an empty decoder should leave it INCOMPLETE. - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.INCOMPLETE); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.INCOMPLETE); } // ------------------------------------------------------------------------- @@ -363,14 +363,14 @@ public void testFinishOnEmptyDecoderIsNoOp() { public void testVS16StaysAttachedToBase() { // ☎ + VS16: the variation selector must NOT cause a break — the decoder // should yield a single GRAPHEME_CLUSTER, not split into two CODEPOINTs. 
- SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('\u260E'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.CODEPOINT); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.CODEPOINT); assertThat(decoder.canPush('\uFE0F')).isTrue(); decoder.push('\uFE0F'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.toString()).isEqualTo("\u260E\uFE0F"); // Width should be wide because of VS16 assertThat(Unicode.isWide(decoder.toString())).isTrue(); @@ -379,13 +379,13 @@ public void testVS16StaysAttachedToBase() { @Test public void testVS15StaysAttachedToBase() { // ☎ + VS15: variation selector must not cause a break either - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('\u260E'); assertThat(decoder.canPush('\uFE0E')).isTrue(); decoder.push('\uFE0E'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.toString()).isEqualTo("\u260E\uFE0E"); // Width should be narrow because of VS15 assertThat(Unicode.isWide(decoder.toString())).isFalse(); @@ -394,7 +394,7 @@ public void testVS15StaysAttachedToBase() { @Test public void testVS16DoesNotAttachToSecondCodepoint() { // After a complete codepoint, a VS on a *different* base must not attach to the first - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('A'); assertThat(decoder.canPush('\uFE0F')).isFalse(); } @@ -405,14 +405,14 @@ public void testVS16DoesNotAttachToSecondCodepoint() { @Test public void testTwoRegionalIndicatorsFormSingleCluster() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); pushString(decoder, new 
String(Character.toChars(0x1F1FA))); // 🇺 assertThat(decoder.isReady()).isTrue(); assertThat(decoder.canPush(0x1F1F8)).isTrue(); // 🇸 can attach decoder.push(0x1F1F8); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.isReady()).isTrue(); // A third regional indicator must NOT attach (would start a new flag) assertThat(decoder.canPush(0x1F1FA)).isFalse(); @@ -422,7 +422,7 @@ public void testTwoRegionalIndicatorsFormSingleCluster() { @Test public void testThirdRegionalIndicatorDoesNotAttach() { // Verify riCount logic: after 2 RI, a 3rd must break - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push(0x1F1FA); decoder.push(0x1F1F8); assertThat(decoder.canPush(0x1F1FA)).isFalse(); @@ -434,7 +434,7 @@ public void testThirdRegionalIndicatorDoesNotAttach() { @Test public void testZwjPreventsBreak() { - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push(0x1F468); // 👨 assertThat(decoder.canPush(Unicode.ZWJ)).isTrue(); decoder.push(Unicode.ZWJ); @@ -443,7 +443,7 @@ public void testZwjPreventsBreak() { assertThat(decoder.canPush(0x1F469)).isTrue(); // 👩 can follow decoder.push(0x1F469); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.isReady()).isTrue(); } @@ -454,12 +454,12 @@ public void testZwjPreventsBreak() { @Test public void testNonSpacingMarkStaysAttached() { // Combining grave accent (U+0300) is NON_SPACING_MARK — must not break - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('a'); assertThat(decoder.canPush('\u0300')).isTrue(); decoder.push('\u0300'); - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + 
assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.toString()).isEqualTo("a\u0300"); assertThat(Unicode.isWide(decoder.toString())).isFalse(); } @@ -467,12 +467,12 @@ public void testNonSpacingMarkStaysAttached() { @Test public void testCombiningSpacingMarkStaysAttached() { // Devanagari vowel sign AA (U+093E) is COMBINING_SPACING_MARK - SequenceDecoder decoder = new SequenceDecoder(); + UnicodeDecoder decoder = new UnicodeDecoder(); decoder.push('\u0915'); // क assertThat(decoder.canPush('\u093E')).isTrue(); decoder.push('\u093E'); // ा - assertThat(decoder.state()).isEqualTo(SequenceDecoder.State.GRAPHEME_CLUSTER); + assertThat(decoder.state()).isEqualTo(UnicodeDecoder.GRAPHEME_CLUSTER); assertThat(decoder.toString()).isEqualTo("\u0915\u093E"); } @@ -480,7 +480,7 @@ public void testCombiningSpacingMarkStaysAttached() { // Helper // ------------------------------------------------------------------------- - private static void pushString(SequenceDecoder decoder, String s) { + private static void pushString(UnicodeDecoder decoder, String s) { for (int i = 0; i < s.length(); i++) { decoder.push(s.charAt(i)); }