Skip to content

Commit 154cd30

Browse files
committed
codePoint()
1 parent dbca717 commit 154cd30

File tree

4 files changed

+42
-16
lines changed

4 files changed

+42
-16
lines changed

dot-parse/README.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,16 +164,12 @@ Parser<String> singleCharEscaped = Parser.chars(1)
164164
The same technique can be used to handle Unicode escaping:
165165

166166
```java {.good}
167-
import static com.google.common.labs.parse.CharacterSet;
168-
import static com.google.common.labs.parse.Parser.chars;
167+
import static com.google.common.labs.parse.Parser.codePoint;
169168
import static com.google.common.labs.parse.Parser.string;
170169

171-
CharacterSet hexDigit = ChraracterSet.charsIn("[0-9a-fA-F]");
172170
Parser<String> unicodeEscaped = string("u")
173-
.then(chars(4)) // 4 chars after \u
174-
.suchThat(hexDigit::matchesAllOf, "4 hex digits")
175-
// parse 4 hex digits to a code point integer; then convert to string
176-
.map(hex -> Character.toString(Integer.parseInt(hex, 16)));
171+
.then(codePoint())
172+
.map(Character::toString);
177173
```
178174

179175
Combine the `singleCharEscaped` and `unicodeEscaped` parsers created above,

dot-parse/src/main/java/com/google/common/labs/parse/CharacterSet.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ private CharacterSet(String string, CharPredicate predicate) {
4343
* @param characterSet A regex-like character set string (e.g. {@code "[a-zA-Z0-9-_]"}),
4444
* but disallows backslash so doesn't support escaping.
4545
* If your character set includes special characters like literal backslash
46-
* or right bracket, use {@link CharPredicate} instead.
46+
* or right bracket, use {@link CharPredicate} directly.
4747
* @throws IllegalArgumentException if {@code characterSet} includes backslash
4848
* or the right bracket (except the outmost pairs of {@code []}).
4949
*/

dot-parse/src/main/java/com/google/common/labs/parse/Parser.java

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,8 @@ public static Parser<String> string(String value) {
212212
*
213213
* <pre>{@code
214214
* Parser<String> unicodeEscaped = string("u")
215-
* .then(chars(4))
216-
* .suchThat(charsIn("[0-9A-Fa-f]")::matchesAllOf, "4 hex digits")
217-
* .map(digits -> Character.toString(Integer.parseInt(digits, 16)));
215+
* .then(codePoint())
216+
* .map(Character::toString);
218217
* quotedStringWithEscapes('"', unicodeEscaped.or(chars(1))).parse("\"foo\\uD83D\\uDE00\"");
219218
* }</pre>
220219
*
@@ -231,6 +230,34 @@ public static Parser<String> quotedStringWithEscapes(
231230
.immediatelyBetween(quoteString, quoteString);
232231
}
233232

233+
/**
234+
* Parses exactly 4 hex digits (case-insensitive) into an integer code point. For example:
235+
*
236+
* <pre>{@code
237+
* codePoint()
238+
* .map(Character::toString)
239+
* .zeroOrMore(Collectors.joining())
240+
* .parse("D83DDE00");
241+
* }</pre>
242+
*
243+
* will return the emoji {@code 😀}, whose UTF-16 surrogate pair is {@code D83D DE00}.
244+
*
245+
* <p>You can also use it together with {@link #quotedStringWithEscapes}:
246+
*
247+
* <pre>{@code
248+
* quotedStringWithEscapes('"', string("u").then(codePoint()).map(Character::toString));
249+
* }</pre>
250+
*
251+
* @since 9.4
252+
*/
253+
public static Parser<Integer> codePoint() {
254+
return chars(4)
255+
.suchThat(
256+
CharPredicate.range('0', '9').orRange('A', 'F').orRange('a', 'f')::matchesAllOf,
257+
"4-digit hex code point")
258+
.map(digits -> Integer.parseInt(digits, 16));
259+
}
260+
234261
/**
235262
* Sequentially matches {@code left} then {@code right}, and then combines the results using the
236263
* {@code combiner} function.

dot-parse/src/test/java/com/google/common/labs/parse/ParserTest.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import static com.google.common.labs.parse.CharacterSet.charsIn;
44
import static com.google.common.labs.parse.Parser.anyOf;
55
import static com.google.common.labs.parse.Parser.chars;
6+
import static com.google.common.labs.parse.Parser.codePoint;
67
import static com.google.common.labs.parse.Parser.consecutive;
78
import static com.google.common.labs.parse.Parser.digits;
89
import static com.google.common.labs.parse.Parser.literally;
@@ -2507,16 +2508,18 @@ public void quotedStringWithEscapes_invalidQuoteChar_throws() {
25072508

25082509
@Test
25092510
public void quotedStringWithEscapes_unicodeEscape_success() {
2510-
Parser<String> unicodeEscaped =
2511-
string("u")
2512-
.then(chars(4))
2513-
.suchThat(charsIn("[0-9A-Fa-f]")::matchesAllOf, "4 hex digits")
2514-
.map(digits -> Character.toString(Integer.parseInt(digits, 16)));
2511+
Parser<String> unicodeEscaped = string("u").then(codePoint()).map(Character::toString);
25152512
Parser<String> quotedString = Parser.quotedStringWithEscapes('\'', unicodeEscaped.or(chars(1)));
25162513
assertThat(quotedString.parse("''")).isEmpty();
25172514
assertThat(quotedString.parse("'emoji: \\uD83D\\uDE00'")).isEqualTo("emoji: 😀");
25182515
}
25192516

2517+
@Test
2518+
public void codePoint_emoji() {
2519+
assertThat(codePoint().map(Character::toString).zeroOrMore(joining()).parse("d83dDE00"))
2520+
.isEqualTo("😀");
2521+
}
2522+
25202523
@Test
25212524
public void consecutive_success() {
25222525
Parser<String> parser = digits();

0 commit comments

Comments
 (0)