codePoint()

fluentfuture · fluentfuture · commit 154cd3085267 · 2025-11-05T10:16:29.000-08:00
diff --git a/dot-parse/README.md b/dot-parse/README.md
@@ -164,16 +164,12 @@ Parser<String> singleCharEscaped =  Parser.chars(1)
 The same technique can be used to handle Unicode escaping:
 
 ```java {.good}
-import static com.google.common.labs.parse.CharacterSet;
-import static com.google.common.labs.parse.Parser.chars;
+import static com.google.common.labs.parse.Parser.codePoint;
 import static com.google.common.labs.parse.Parser.string;
 
-CharacterSet hexDigit = ChraracterSet.charsIn("[0-9a-fA-F]");
 Parser<String> unicodeEscaped = string("u")
-    .then(chars(4))  // 4 chars after \u
-    .suchThat(hexDigit::matchesAllOf, "4 hex digits")
-    // parse 4 hex digits to a code point integer; then convert to string
-    .map(hex -> Character.toString(Integer.parseInt(hex, 16)));
+    .then(codePoint())
+    .map(Character::toString);
 ```
 
 Combine the `singleCharEscaped` and `unicodeEscaped` parsers created above,
diff --git a/dot-parse/src/main/java/com/google/common/labs/parse/CharacterSet.java b/dot-parse/src/main/java/com/google/common/labs/parse/CharacterSet.java
@@ -43,7 +43,7 @@ private CharacterSet(String string, CharPredicate predicate) {
    * @param characterSet A regex-like character set string (e.g. {@code "[a-zA-Z0-9-_]"}),
    *        but disallows backslash so doesn't support escaping.
    *        If your character set includes special characters like literal backslash
-   *        or right bracket, use {@link CharPredicate} instead.
+   *        or right bracket, use {@link CharPredicate} directly.
    * @throws IllegalArgumentException if {@code characterSet} includes backslash
    *         or the right bracket (except the outermost pair of {@code []}).
    */
diff --git a/dot-parse/src/main/java/com/google/common/labs/parse/Parser.java b/dot-parse/src/main/java/com/google/common/labs/parse/Parser.java
@@ -212,9 +212,8 @@ public static Parser<String> string(String value) {
    *
    * <pre>{@code
    * Parser<String> unicodeEscaped = string("u")
-   *     .then(chars(4))
-   *     .suchThat(charsIn("[0-9A-Fa-f]")::matchesAllOf, "4 hex digits")
-   *     .map(digits -> Character.toString(Integer.parseInt(digits, 16)));
+   *     .then(codePoint())
+   *     .map(Character::toString);
    * quotedStringWithEscapes('"', unicodeEscaped.or(chars(1))).parse("\"foo\\uD83D\\uDE00\"");
    * }</pre>
    *
@@ -231,6 +230,34 @@ public static Parser<String> quotedStringWithEscapes(
         .immediatelyBetween(quoteString, quoteString);
   }
 
+  /**
+   * Parses exactly 4 hex digits (either case) as the integer value of a UTF-16 code unit. For example:
+   *
+   * <pre>{@code
+   * codePoint()
+   *     .map(Character::toString)
+   *     .zeroOrMore(Collectors.joining())
+   *     .parse("D83DDE00");
+   * }</pre>
+   *
+   * will return the emoji {@code 😀} (the surrogate pair {@code D83D}, {@code DE00} joined).
+   *
+   * <p>You can also use it together with {@link #quotedStringWithEscapes}:
+   *
+   * <pre>{@code
+   * quotedStringWithEscapes('"', string("u").then(codePoint()).map(Character::toString));
+   * }</pre>
+   *
+   * @since 9.4
+   */
+  public static Parser<Integer> codePoint() {
+    return chars(4)
+        .suchThat(
+            CharPredicate.range('0', '9').orRange('A', 'F').orRange('a', 'f')::matchesAllOf,
+            "4-digit hex code point")
+        .map(digits -> Integer.parseInt(digits, 16));
+  }
+
   /**
    * Sequentially matches {@code left} then {@code right}, and then combines the results using the
    * {@code combiner} function.
diff --git a/dot-parse/src/test/java/com/google/common/labs/parse/ParserTest.java b/dot-parse/src/test/java/com/google/common/labs/parse/ParserTest.java
@@ -3,6 +3,7 @@
 import static com.google.common.labs.parse.CharacterSet.charsIn;
 import static com.google.common.labs.parse.Parser.anyOf;
 import static com.google.common.labs.parse.Parser.chars;
+import static com.google.common.labs.parse.Parser.codePoint;
 import static com.google.common.labs.parse.Parser.consecutive;
 import static com.google.common.labs.parse.Parser.digits;
 import static com.google.common.labs.parse.Parser.literally;
@@ -2507,16 +2508,18 @@ public void quotedStringWithEscapes_invalidQuoteChar_throws() {
 
   @Test
   public void quotedStringWithEscapes_unicodeEscape_success() {
-    Parser<String> unicodeEscaped =
-        string("u")
-            .then(chars(4))
-            .suchThat(charsIn("[0-9A-Fa-f]")::matchesAllOf, "4 hex digits")
-            .map(digits -> Character.toString(Integer.parseInt(digits, 16)));
+    Parser<String> unicodeEscaped = string("u").then(codePoint()).map(Character::toString);
     Parser<String> quotedString = Parser.quotedStringWithEscapes('\'', unicodeEscaped.or(chars(1)));
     assertThat(quotedString.parse("''")).isEmpty();
     assertThat(quotedString.parse("'emoji: \\uD83D\\uDE00'")).isEqualTo("emoji: 😀");
   }
 
+  @Test
+  public void codePoint_emoji() {
+    assertThat(codePoint().map(Character::toString).zeroOrMore(joining()).parse("d83dDE00"))
+        .isEqualTo("😀");
+  }
+
   @Test
   public void consecutive_success() {
     Parser<String> parser = digits();