001 /*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021
022 import com.google.common.annotations.Beta;
023 import com.google.common.annotations.GwtCompatible;
024
025 import java.util.Arrays;
026 import javax.annotation.CheckReturnValue;
027
028 /**
029 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
030 * for any {@link Object}. Also offers basic text processing methods based on this function.
031 * Implementations are strongly encouraged to be side-effect-free and immutable.
032 *
033 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
034 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
035 *
036 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
037 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
038 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
039 * treats these just as two separate characters.
040 *
041 * <p>Example usages: <pre>
042 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
043 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
044 *
045 * <p>See the Guava User Guide article on <a href=
046 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
047 * {@code CharMatcher}</a>.
048 *
049 * @author Kevin Bourrillion
050 * @since 1.0
051 */
052 @Beta // Possibly change from chars to code points; decide constants vs. methods
053 @GwtCompatible
054 public abstract class CharMatcher implements Predicate<Character> {
055 // Constants
056 /**
057 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
058 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
059 * discussion of that term.
060 *
061 * @since 2.0
062 */
063 public static final CharMatcher BREAKING_WHITESPACE =
064 anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
065 .or(inRange('\u2000', '\u2006'))
066 .or(inRange('\u2008', '\u200a'))
067 .withToString("CharMatcher.BREAKING_WHITESPACE")
068 .precomputed();
069
070 /**
071 * Determines whether a character is ASCII, meaning that its code point is less than 128.
072 */
073 public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
074
/**
 * Matches digits according to
 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
 */
public static final CharMatcher DIGIT;

static {
  // Each character below is treated as the "zero" of a run of ten consecutive digits.
  String zeroDigits =
      "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
          + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
          + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";

  // Start from the ASCII digits and union in one ten-character range per zero digit.
  CharMatcher matcher = inRange('0', '9');
  for (int i = 0; i < zeroDigits.length(); i++) {
    char zero = zeroDigits.charAt(i);
    char nine = (char) (zero + 9);
    matcher = matcher.or(inRange(zero, nine));
  }
  DIGIT = matcher.withToString("CharMatcher.DIGIT").precomputed();
}
092
093 /**
094 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
095 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
096 */
097 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
098 @Override
099 public boolean matches(char c) {
100 return Character.isDigit(c);
101 }
102 };
103
104 /**
105 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
106 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
107 * inRange('a', 'z').or(inRange('A', 'Z'))}.
108 */
109 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
110 @Override
111 public boolean matches(char c) {
112 return Character.isLetter(c);
113 }
114
115 @Override
116 public CharMatcher precomputed() {
117 return this;
118 }
119 };
120
121 /**
122 * Determines whether a character is a letter or digit according to {@link
123 * Character#isLetterOrDigit(char) Java's definition}.
124 */
125 public static final CharMatcher JAVA_LETTER_OR_DIGIT =
126 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
127 @Override
128 public boolean matches(char c) {
129 return Character.isLetterOrDigit(c);
130 }
131 };
132
133 /**
134 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
135 * Java's definition}.
136 */
137 public static final CharMatcher JAVA_UPPER_CASE =
138 new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
139 @Override
140 public boolean matches(char c) {
141 return Character.isUpperCase(c);
142 }
143 };
144
145 /**
146 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
147 * Java's definition}.
148 */
149 public static final CharMatcher JAVA_LOWER_CASE =
150 new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
151 @Override
152 public boolean matches(char c) {
153 return Character.isLowerCase(c);
154 }
155 };
156
157 /**
158 * Determines whether a character is an ISO control character as specified by {@link
159 * Character#isISOControl(char)}.
160 */
161 public static final CharMatcher JAVA_ISO_CONTROL =
162 inRange('\u0000', '\u001f')
163 .or(inRange('\u007f', '\u009f'))
164 .withToString("CharMatcher.JAVA_ISO_CONTROL");
165
166 /**
167 * Determines whether a character is invisible; that is, if its Unicode category is any of
168 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
169 * PRIVATE_USE according to ICU4J.
170 */
171 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
172 .or(inRange('\u007f', '\u00a0'))
173 .or(is('\u00ad'))
174 .or(inRange('\u0600', '\u0604'))
175 .or(anyOf("\u06dd\u070f\u1680\u180e"))
176 .or(inRange('\u2000', '\u200f'))
177 .or(inRange('\u2028', '\u202f'))
178 .or(inRange('\u205f', '\u2064'))
179 .or(inRange('\u206a', '\u206f'))
180 .or(is('\u3000'))
181 .or(inRange('\ud800', '\uf8ff'))
182 .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
183 .withToString("CharMatcher.INVISIBLE")
184 .precomputed();
185
186 /**
187 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
188 * errs on the side of returning {@code false} (that is, it tends to assume a character is
189 * double-width).
190 *
191 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
192 * date.
193 */
194 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
195 .or(is('\u05be'))
196 .or(inRange('\u05d0', '\u05ea'))
197 .or(is('\u05f3'))
198 .or(is('\u05f4'))
199 .or(inRange('\u0600', '\u06ff'))
200 .or(inRange('\u0750', '\u077f'))
201 .or(inRange('\u0e00', '\u0e7f'))
202 .or(inRange('\u1e00', '\u20af'))
203 .or(inRange('\u2100', '\u213a'))
204 .or(inRange('\ufb50', '\ufdff'))
205 .or(inRange('\ufe70', '\ufeff'))
206 .or(inRange('\uff61', '\uffdc'))
207 .withToString("CharMatcher.SINGLE_WIDTH")
208 .precomputed();
209
210 /** Matches any character. */
211 public static final CharMatcher ANY =
212 new CharMatcher("CharMatcher.ANY") {
213 @Override
214 public boolean matches(char c) {
215 return true;
216 }
217
218
219 @Override
220 public int indexIn(CharSequence sequence) {
221 return (sequence.length() == 0) ? -1 : 0;
222 }
223
224
225 @Override
226 public int indexIn(CharSequence sequence, int start) {
227 int length = sequence.length();
228 Preconditions.checkPositionIndex(start, length);
229 return (start == length) ? -1 : start;
230 }
231
232
233 @Override
234 public int lastIndexIn(CharSequence sequence) {
235 return sequence.length() - 1;
236 }
237
238
239 @Override
240 public boolean matchesAllOf(CharSequence sequence) {
241 checkNotNull(sequence);
242 return true;
243 }
244
245
246 @Override
247 public boolean matchesNoneOf(CharSequence sequence) {
248 return sequence.length() == 0;
249 }
250
251
252 @Override
253 public String removeFrom(CharSequence sequence) {
254 checkNotNull(sequence);
255 return "";
256 }
257
258
259 @Override
260 public String replaceFrom(CharSequence sequence, char replacement) {
261 char[] array = new char[sequence.length()];
262 Arrays.fill(array, replacement);
263 return new String(array);
264 }
265
266
267 @Override
268 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
269 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
270 for (int i = 0; i < sequence.length(); i++) {
271 retval.append(replacement);
272 }
273 return retval.toString();
274 }
275
276
277 @Override
278 public String collapseFrom(CharSequence sequence, char replacement) {
279 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
280 }
281
282
283 @Override
284 public String trimFrom(CharSequence sequence) {
285 checkNotNull(sequence);
286 return "";
287 }
288
289
290 @Override
291 public int countIn(CharSequence sequence) {
292 return sequence.length();
293 }
294
295
296 @Override
297 public CharMatcher and(CharMatcher other) {
298 return checkNotNull(other);
299 }
300
301
302 @Override
303 public CharMatcher or(CharMatcher other) {
304 checkNotNull(other);
305 return this;
306 }
307
308
309 @Override
310 public CharMatcher negate() {
311 return NONE;
312 }
313
314
315 @Override
316 public CharMatcher precomputed() {
317 return this;
318 }
319 };
320
321 /** Matches no characters. */
322 public static final CharMatcher NONE =
323 new CharMatcher("CharMatcher.NONE") {
324 @Override
325 public boolean matches(char c) {
326 return false;
327 }
328
329
330 @Override
331 public int indexIn(CharSequence sequence) {
332 checkNotNull(sequence);
333 return -1;
334 }
335
336
337 @Override
338 public int indexIn(CharSequence sequence, int start) {
339 int length = sequence.length();
340 Preconditions.checkPositionIndex(start, length);
341 return -1;
342 }
343
344
345 @Override
346 public int lastIndexIn(CharSequence sequence) {
347 checkNotNull(sequence);
348 return -1;
349 }
350
351
352 @Override
353 public boolean matchesAllOf(CharSequence sequence) {
354 return sequence.length() == 0;
355 }
356
357
358 @Override
359 public boolean matchesNoneOf(CharSequence sequence) {
360 checkNotNull(sequence);
361 return true;
362 }
363
364
365 @Override
366 public String removeFrom(CharSequence sequence) {
367 return sequence.toString();
368 }
369
370
371 @Override
372 public String replaceFrom(CharSequence sequence, char replacement) {
373 return sequence.toString();
374 }
375
376
377 @Override
378 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
379 checkNotNull(replacement);
380 return sequence.toString();
381 }
382
383
384 @Override
385 public String collapseFrom(CharSequence sequence, char replacement) {
386 return sequence.toString();
387 }
388
389
390 @Override
391 public String trimFrom(CharSequence sequence) {
392 return sequence.toString();
393 }
394
395
396 @Override
397 public int countIn(CharSequence sequence) {
398 checkNotNull(sequence);
399 return 0;
400 }
401
402
403 @Override
404 public CharMatcher and(CharMatcher other) {
405 checkNotNull(other);
406 return this;
407 }
408
409
410 @Override
411 public CharMatcher or(CharMatcher other) {
412 return checkNotNull(other);
413 }
414
415
416 @Override
417 public CharMatcher negate() {
418 return ANY;
419 }
420
421
422 @Override
423 void setBits(LookupTable table) {}
424
425
426 @Override
427 public CharMatcher precomputed() {
428 return this;
429 }
430 };
431
432 // Static factories
433
434 /**
435 * Returns a {@code char} matcher that matches only one specified character.
436 */
437 public static CharMatcher is(final char match) {
438 String description = new StringBuilder("CharMatcher.is(")
439 .append(Integer.toHexString(match))
440 .append(")")
441 .toString();
442 return new CharMatcher(description) {
443 @Override
444 public boolean matches(char c) {
445 return c == match;
446 }
447
448
449 @Override
450 public String replaceFrom(CharSequence sequence, char replacement) {
451 return sequence.toString().replace(match, replacement);
452 }
453
454
455 @Override
456 public CharMatcher and(CharMatcher other) {
457 return other.matches(match) ? this : NONE;
458 }
459
460
461 @Override
462 public CharMatcher or(CharMatcher other) {
463 return other.matches(match) ? other : super.or(other);
464 }
465
466
467 @Override
468 public CharMatcher negate() {
469 return isNot(match);
470 }
471
472
473 @Override
474 void setBits(LookupTable table) {
475 table.set(match);
476 }
477
478
479 @Override
480 public CharMatcher precomputed() {
481 return this;
482 }
483 };
484 }
485
486 /**
487 * Returns a {@code char} matcher that matches any character except the one specified.
488 *
489 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
490 */
491 public static CharMatcher isNot(final char match) {
492 String description = new StringBuilder("CharMatcher.isNot(")
493 .append(Integer.toHexString(match))
494 .append(")")
495 .toString();
496 return new CharMatcher(description) {
497 @Override
498 public boolean matches(char c) {
499 return c != match;
500 }
501
502
503 @Override
504 public CharMatcher and(CharMatcher other) {
505 return other.matches(match) ? super.and(other) : other;
506 }
507
508
509 @Override
510 public CharMatcher or(CharMatcher other) {
511 return other.matches(match) ? ANY : this;
512 }
513
514
515 @Override
516 public CharMatcher negate() {
517 return is(match);
518 }
519 };
520 }
521
522 /**
523 * Returns a {@code char} matcher that matches any character present in the given character
524 * sequence.
525 */
526 public static CharMatcher anyOf(final CharSequence sequence) {
527 switch (sequence.length()) {
528 case 0:
529 return NONE;
530 case 1:
531 return is(sequence.charAt(0));
532 case 2:
533 final char match1 = sequence.charAt(0);
534 final char match2 = sequence.charAt(1);
535 return new CharMatcher(
536 new StringBuilder("CharMatcher.anyOf(\"").append(sequence).append("\")").toString()) {
537 @Override
538 public boolean matches(char c) {
539 return c == match1 || c == match2;
540 }
541
542
543 @Override
544 void setBits(LookupTable table) {
545 table.set(match1);
546 table.set(match2);
547 }
548
549
550 @Override
551 public CharMatcher precomputed() {
552 return this;
553 }
554 };
555 }
556 final char[] chars = sequence.toString().toCharArray();
557 Arrays.sort(chars);
558
559 return new CharMatcher(new StringBuilder("CharMatcher.anyOf(\"").append(chars)
560 .append("\")").toString()) {
561 @Override
562 public boolean matches(char c) {
563 return Arrays.binarySearch(chars, c) >= 0;
564 }
565 };
566 }
567
568 /**
569 * Returns a {@code char} matcher that matches any character not present in the given character
570 * sequence.
571 */
572 public static CharMatcher noneOf(CharSequence sequence) {
573 return anyOf(sequence).negate();
574 }
575
576 /**
577 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
578 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
579 * CharMatcher.inRange('a', 'z')}.
580 *
581 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
582 */
583 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
584 checkArgument(endInclusive >= startInclusive);
585 String description = new StringBuilder("CharMatcher.inRange(")
586 .append(Integer.toHexString(startInclusive))
587 .append(", ")
588 .append(Integer.toHexString(endInclusive))
589 .append(")")
590 .toString();
591 return inRange(startInclusive, endInclusive, description);
592 }
593
594 static CharMatcher inRange(final char startInclusive, final char endInclusive,
595 String description) {
596 return new CharMatcher(description) {
597 @Override
598 public boolean matches(char c) {
599 return startInclusive <= c && c <= endInclusive;
600 }
601
602
603 @Override
604 void setBits(LookupTable table) {
605 char c = startInclusive;
606 while (true) {
607 table.set(c);
608 if (c++ == endInclusive) {
609 break;
610 }
611 }
612 }
613
614
615 @Override
616 public CharMatcher precomputed() {
617 return this;
618 }
619 };
620 }
621
622 /**
623 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
624 * which operates on primitive {@code char} instances instead.
625 */
626 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
627 checkNotNull(predicate);
628 if (predicate instanceof CharMatcher) {
629 return (CharMatcher) predicate;
630 }
631 String description = new StringBuilder("CharMatcher.forPredicate(")
632 .append(predicate)
633 .append(')')
634 .toString();
635 return new CharMatcher(description) {
636 @Override
637 public boolean matches(char c) {
638 return predicate.apply(c);
639 }
640
641
642 @Override
643 public boolean apply(Character character) {
644 return predicate.apply(checkNotNull(character));
645 }
646 };
647 }
648
// State

/** Description text for this matcher, fixed at construction time. */
final String description;

// Constructors

/**
 * Sets the {@code toString()} from the given description.
 */
CharMatcher(String description) {
  this.description = description;
}

/**
 * Constructor for use by subclasses. The description defaults to {@code "UnknownCharMatcher"};
 * when subclassing, you may want to override {@code toString()} to provide a useful description.
 */
protected CharMatcher() {
  this("UnknownCharMatcher");
}
668
669 // Abstract methods
670
671 /** Determines a true or false value for the given character. */
672 public abstract boolean matches(char c);
673
674 // Non-static factories
675
676 /**
677 * Returns a matcher that matches any character not matched by this matcher.
678 */
679 public CharMatcher negate() {
680 final CharMatcher original = this;
681 return new CharMatcher(original + ".negate()") {
682 @Override
683 public boolean matches(char c) {
684 return !original.matches(c);
685 }
686
687
688 @Override
689 public boolean matchesAllOf(CharSequence sequence) {
690 return original.matchesNoneOf(sequence);
691 }
692
693
694 @Override
695 public boolean matchesNoneOf(CharSequence sequence) {
696 return original.matchesAllOf(sequence);
697 }
698
699
700 @Override
701 public int countIn(CharSequence sequence) {
702 return sequence.length() - original.countIn(sequence);
703 }
704
705
706 @Override
707 public CharMatcher negate() {
708 return original;
709 }
710 };
711 }
712
713 /**
714 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
715 */
716 public CharMatcher and(CharMatcher other) {
717 return new And(this, checkNotNull(other));
718 }
719
720 private static class And extends CharMatcher {
721 final CharMatcher first;
722 final CharMatcher second;
723
724 And(CharMatcher a, CharMatcher b) {
725 this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
726 }
727
728 And(CharMatcher a, CharMatcher b, String description) {
729 super(description);
730 first = checkNotNull(a);
731 second = checkNotNull(b);
732 }
733
734
735 @Override
736 public CharMatcher and(CharMatcher other) {
737 return new And(this, other);
738 }
739
740
741 @Override
742 public boolean matches(char c) {
743 return first.matches(c) && second.matches(c);
744 }
745
746
747 @Override
748 CharMatcher withToString(String description) {
749 return new And(first, second, description);
750 }
751 }
752
753 /**
754 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
755 */
756 public CharMatcher or(CharMatcher other) {
757 return new Or(this, checkNotNull(other));
758 }
759
760 private static class Or extends CharMatcher {
761 final CharMatcher first;
762 final CharMatcher second;
763
764 Or(CharMatcher a, CharMatcher b, String description) {
765 super(description);
766 first = checkNotNull(a);
767 second = checkNotNull(b);
768 }
769
770 Or(CharMatcher a, CharMatcher b) {
771 this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
772 }
773
774
775 @Override
776 public CharMatcher or(CharMatcher other) {
777 return new Or(this, checkNotNull(other));
778 }
779
780
781 @Override
782 public boolean matches(char c) {
783 return first.matches(c) || second.matches(c);
784 }
785
786
787 @Override
788 CharMatcher withToString(String description) {
789 return new Or(first, second, description);
790 }
791 }
792
793 /**
794 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
795 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
796 * worthwhile only if the precomputed matcher is queried many thousands of times.
797 *
798 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
799 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
800 * worthwhile tradeoff in a browser.
801 */
802 public CharMatcher precomputed() {
803 return Platform.precomputeCharMatcher(this);
804 }
805
806 /**
807 * Construct an array of all possible chars in the slowest way possible.
808 */
809 char[] slowGetChars() {
810 char[] allChars = new char[65536];
811 int size = 0;
812 for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
813 if (matches((char) c)) {
814 allChars[size++] = (char) c;
815 }
816 }
817 char[] retValue = new char[size];
818 System.arraycopy(allChars, 0, retValue, 0, size);
819 return retValue;
820 }
821
822 /**
823 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
824 * on {@link Platform} so that we can have different behavior in GWT.
825 *
826 * <p>If the number of matched characters is small enough, we try to build a small hash
827 * table to contain all of the characters. Otherwise, we record the characters in eight-kilobyte
828 * bit array. In many situations this produces a matcher which is faster to query
829 * than the original.
830 */
831 CharMatcher precomputedInternal() {
832 final char[] chars = slowGetChars();
833 int totalCharacters = chars.length;
834 if (totalCharacters == 0) {
835 return NONE;
836 } else if (totalCharacters == 1) {
837 return is(chars[0]);
838 } else if (totalCharacters < SmallCharMatcher.MAX_SIZE) {
839 return SmallCharMatcher.from(chars, toString());
840 } else if (totalCharacters < MediumCharMatcher.MAX_SIZE) {
841 return MediumCharMatcher.from(chars, toString());
842 }
843 // Otherwise, make the full lookup table.
844 final LookupTable table = new LookupTable();
845 setBits(table);
846 final CharMatcher outer = this;
847
848 return new CharMatcher(outer.toString()) {
849 @Override
850 public boolean matches(char c) {
851 return table.get(c);
852 }
853
854 // TODO(kevinb): make methods like negate() smart?
855
856
857 @Override
858 public CharMatcher precomputed() {
859 return this;
860 }
861 };
862 }
863
864 /**
865 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
866 * but with their {@code toString} method overridden with the new description.
867 *
868 * <p>This is unsupported by default.
869 */
870 CharMatcher withToString(String description) {
871 throw new UnsupportedOperationException();
872
873 }
874
875 /**
876 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
877 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
878 *
879 * <p>The default implementation loops over every possible character value, invoking {@link
880 * #matches} for each one.
881 */
882 void setBits(LookupTable table) {
883 char c = Character.MIN_VALUE;
884 while (true) {
885 if (matches(c)) {
886 table.set(c);
887 }
888 if (c++ == Character.MAX_VALUE) {
889 break;
890 }
891 }
892 }
893
894 /**
895 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
896 *
897 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
898 * simpler java.util.BitSet.
899 */
900 private static final class LookupTable {
901 int[] data = new int[2048];
902
903 void set(char index) {
904 data[index >> 5] |= (1 << index);
905 }
906
907 boolean get(char index) {
908 return (data[index >> 5] & (1 << index)) != 0;
909 }
910 }
911
912 // Text processing routines
913
914 /**
915 * Returns {@code true} if a character sequence contains at least one matching character.
916 * Equivalent to {@code !matchesNoneOf(sequence)}.
917 *
918 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
919 * character, until this returns {@code true} or the end is reached.
920 *
921 * @param sequence the character sequence to examine, possibly empty
922 * @return {@code true} if this matcher matches at least one character in the sequence
923 * @since 8.0
924 */
925 public boolean matchesAnyOf(CharSequence sequence) {
926 return !matchesNoneOf(sequence);
927 }
928
929 /**
930 * Returns {@code true} if a character sequence contains only matching characters.
931 *
932 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
933 * character, until this returns {@code false} or the end is reached.
934 *
935 * @param sequence the character sequence to examine, possibly empty
936 * @return {@code true} if this matcher matches every character in the sequence, including when
937 * the sequence is empty
938 */
939 public boolean matchesAllOf(CharSequence sequence) {
940 for (int i = sequence.length() - 1; i >= 0; i--) {
941 if (!matches(sequence.charAt(i))) {
942 return false;
943 }
944 }
945 return true;
946 }
947
948 /**
949 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
950 * {@code !matchesAnyOf(sequence)}.
951 *
952 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
953 * character, until this returns {@code false} or the end is reached.
954 *
955 * @param sequence the character sequence to examine, possibly empty
956 * @return {@code true} if this matcher matches every character in the sequence, including when
957 * the sequence is empty
958 */
959 public boolean matchesNoneOf(CharSequence sequence) {
960 return indexIn(sequence) == -1;
961 }
962
963 /**
964 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
965 * matching character is present.
966 *
967 * <p>The default implementation iterates over the sequence in forward order calling {@link
968 * #matches} for each character.
969 *
970 * @param sequence the character sequence to examine from the beginning
971 * @return an index, or {@code -1} if no character matches
972 */
973 public int indexIn(CharSequence sequence) {
974 int length = sequence.length();
975 for (int i = 0; i < length; i++) {
976 if (matches(sequence.charAt(i))) {
977 return i;
978 }
979 }
980 return -1;
981 }
982
983 /**
984 * Returns the index of the first matching character in a character sequence, starting from a
985 * given position, or {@code -1} if no character matches after that position.
986 *
987 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
988 * start}, calling {@link #matches} for each character.
989 *
990 * @param sequence the character sequence to examine
991 * @param start the first index to examine; must be nonnegative and no greater than {@code
992 * sequence.length()}
993 * @return the index of the first matching character, guaranteed to be no less than {@code start},
994 * or {@code -1} if no character matches
995 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
996 * sequence.length()}
997 */
998 public int indexIn(CharSequence sequence, int start) {
999 int length = sequence.length();
1000 Preconditions.checkPositionIndex(start, length);
1001 for (int i = start; i < length; i++) {
1002 if (matches(sequence.charAt(i))) {
1003 return i;
1004 }
1005 }
1006 return -1;
1007 }
1008
1009 /**
1010 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1011 * matching character is present.
1012 *
1013 * <p>The default implementation iterates over the sequence in reverse order calling {@link
1014 * #matches} for each character.
1015 *
1016 * @param sequence the character sequence to examine from the end
1017 * @return an index, or {@code -1} if no character matches
1018 */
1019 public int lastIndexIn(CharSequence sequence) {
1020 for (int i = sequence.length() - 1; i >= 0; i--) {
1021 if (matches(sequence.charAt(i))) {
1022 return i;
1023 }
1024 }
1025 return -1;
1026 }
1027
1028 /**
1029 * Returns the number of matching characters found in a character sequence.
1030 */
1031 public int countIn(CharSequence sequence) {
1032 int count = 0;
1033 for (int i = 0; i < sequence.length(); i++) {
1034 if (matches(sequence.charAt(i))) {
1035 count++;
1036 }
1037 }
1038 return count;
1039 }
1040
1041 /**
1042 * Returns a string containing all non-matching characters of a character sequence, in order. For
1043 * example: <pre> {@code
1044 *
1045 * CharMatcher.is('a').removeFrom("bazaar")}</pre>
1046 *
1047 * ... returns {@code "bzr"}.
1048 */
1049 @CheckReturnValue
1050 public String removeFrom(CharSequence sequence) {
1051 String string = sequence.toString();
1052 int pos = indexIn(string);
1053 if (pos == -1) {
1054 return string;
1055 }
1056
1057 char[] chars = string.toCharArray();
1058 int spread = 1;
1059
1060 // This unusual loop comes from extensive benchmarking
1061 OUT: while (true) {
1062 pos++;
1063 while (true) {
1064 if (pos == chars.length) {
1065 break OUT;
1066 }
1067 if (matches(chars[pos])) {
1068 break;
1069 }
1070 chars[pos - spread] = chars[pos];
1071 pos++;
1072 }
1073 spread++;
1074 }
1075 return new String(chars, 0, pos - spread);
1076 }
1077
1078 /**
1079 * Returns a string containing all matching characters of a character sequence, in order. For
1080 * example: <pre> {@code
1081 *
1082 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
1083 *
1084 * ... returns {@code "aaa"}.
1085 */
1086 @CheckReturnValue
1087 public String retainFrom(CharSequence sequence) {
1088 return negate().removeFrom(sequence);
1089 }
1090
1091 /**
1092 * Returns a string copy of the input character sequence, with each character that matches this
1093 * matcher replaced by a given replacement character. For example: <pre> {@code
1094 *
1095 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1096 *
1097 * ... returns {@code "rodor"}.
1098 *
1099 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1100 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1101 * character.
1102 *
1103 * @param sequence the character sequence to replace matching characters in
1104 * @param replacement the character to append to the result string in place of each matching
1105 * character in {@code sequence}
1106 * @return the new string
1107 */
1108 @CheckReturnValue
1109 public String replaceFrom(CharSequence sequence, char replacement) {
1110 String string = sequence.toString();
1111 int pos = indexIn(string);
1112 if (pos == -1) {
1113 return string;
1114 }
1115 char[] chars = string.toCharArray();
1116 chars[pos] = replacement;
1117 for (int i = pos + 1; i < chars.length; i++) {
1118 if (matches(chars[i])) {
1119 chars[i] = replacement;
1120 }
1121 }
1122 return new String(chars);
1123 }
1124
1125 /**
1126 * Returns a string copy of the input character sequence, with each character that matches this
1127 * matcher replaced by a given replacement sequence. For example: <pre> {@code
1128 *
1129 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1130 *
1131 * ... returns {@code "yoohoo"}.
1132 *
1133 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1134 * off calling {@link #replaceFrom(CharSequence, char)} directly.
1135 *
1136 * @param sequence the character sequence to replace matching characters in
1137 * @param replacement the characters to append to the result string in place of each matching
1138 * character in {@code sequence}
1139 * @return the new string
1140 */
1141 @CheckReturnValue
1142 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1143 int replacementLen = replacement.length();
1144 if (replacementLen == 0) {
1145 return removeFrom(sequence);
1146 }
1147 if (replacementLen == 1) {
1148 return replaceFrom(sequence, replacement.charAt(0));
1149 }
1150
1151 String string = sequence.toString();
1152 int pos = indexIn(string);
1153 if (pos == -1) {
1154 return string;
1155 }
1156
1157 int len = string.length();
1158 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1159
1160 int oldpos = 0;
1161 do {
1162 buf.append(string, oldpos, pos);
1163 buf.append(replacement);
1164 oldpos = pos + 1;
1165 pos = indexIn(string, oldpos);
1166 } while (pos != -1);
1167
1168 buf.append(string, oldpos, len);
1169 return buf.toString();
1170 }
1171
1172 /**
1173 * Returns a substring of the input character sequence that omits all characters this matcher
1174 * matches from the beginning and from the end of the string. For example: <pre> {@code
1175 *
1176 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1177 *
1178 * ... returns {@code "cat"}.
1179 *
1180 * <p>Note that: <pre> {@code
1181 *
1182 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1183 *
1184 * ... is equivalent to {@link String#trim()}.
1185 */
1186 @CheckReturnValue
1187 public String trimFrom(CharSequence sequence) {
1188 int len = sequence.length();
1189 int first;
1190 int last;
1191
1192 for (first = 0; first < len; first++) {
1193 if (!matches(sequence.charAt(first))) {
1194 break;
1195 }
1196 }
1197 for (last = len - 1; last > first; last--) {
1198 if (!matches(sequence.charAt(last))) {
1199 break;
1200 }
1201 }
1202
1203 return sequence.subSequence(first, last + 1).toString();
1204 }
1205
1206 /**
1207 * Returns a substring of the input character sequence that omits all characters this matcher
1208 * matches from the beginning of the string. For example: <pre> {@code
1209 *
1210 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1211 *
1212 * ... returns {@code "catbab"}.
1213 */
1214 @CheckReturnValue
1215 public String trimLeadingFrom(CharSequence sequence) {
1216 int len = sequence.length();
1217 int first;
1218
1219 for (first = 0; first < len; first++) {
1220 if (!matches(sequence.charAt(first))) {
1221 break;
1222 }
1223 }
1224
1225 return sequence.subSequence(first, len).toString();
1226 }
1227
1228 /**
1229 * Returns a substring of the input character sequence that omits all characters this matcher
1230 * matches from the end of the string. For example: <pre> {@code
1231 *
1232 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1233 *
1234 * ... returns {@code "abacat"}.
1235 */
1236 @CheckReturnValue
1237 public String trimTrailingFrom(CharSequence sequence) {
1238 int len = sequence.length();
1239 int last;
1240
1241 for (last = len - 1; last >= 0; last--) {
1242 if (!matches(sequence.charAt(last))) {
1243 break;
1244 }
1245 }
1246
1247 return sequence.subSequence(0, last + 1).toString();
1248 }
1249
1250 /**
1251 * Returns a string copy of the input character sequence, with each group of consecutive
1252 * characters that match this matcher replaced by a single replacement character. For example:
1253 * <pre> {@code
1254 *
1255 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1256 *
1257 * ... returns {@code "b-p-r"}.
1258 *
1259 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1260 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1261 * character.
1262 *
1263 * @param sequence the character sequence to replace matching groups of characters in
1264 * @param replacement the character to append to the result string in place of each group of
1265 * matching characters in {@code sequence}
1266 * @return the new string
1267 */
1268 @CheckReturnValue
1269 public String collapseFrom(CharSequence sequence, char replacement) {
1270 int first = indexIn(sequence);
1271 if (first == -1) {
1272 return sequence.toString();
1273 }
1274
1275 // TODO(kevinb): see if this implementation can be made faster
1276 StringBuilder builder = new StringBuilder(sequence.length())
1277 .append(sequence.subSequence(0, first))
1278 .append(replacement);
1279 boolean in = true;
1280 for (int i = first + 1; i < sequence.length(); i++) {
1281 char c = sequence.charAt(i);
1282 if (matches(c)) {
1283 if (!in) {
1284 builder.append(replacement);
1285 in = true;
1286 }
1287 } else {
1288 builder.append(c);
1289 in = false;
1290 }
1291 }
1292 return builder.toString();
1293 }
1294
1295 /**
1296 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1297 * groups of matching characters at the start or end of the sequence are removed without
1298 * replacement.
1299 */
1300 @CheckReturnValue
1301 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1302 int first = negate().indexIn(sequence);
1303 if (first == -1) {
1304 return ""; // everything matches. nothing's left.
1305 }
1306 StringBuilder builder = new StringBuilder(sequence.length());
1307 boolean inMatchingGroup = false;
1308 for (int i = first; i < sequence.length(); i++) {
1309 char c = sequence.charAt(i);
1310 if (matches(c)) {
1311 inMatchingGroup = true;
1312 } else {
1313 if (inMatchingGroup) {
1314 builder.append(replacement);
1315 inMatchingGroup = false;
1316 }
1317 builder.append(c);
1318 }
1319 }
1320 return builder.toString();
1321 }
1322
1323 // Predicate interface
1324
1325 /**
1326 * Returns {@code true} if this matcher matches the given character.
1327 *
1328 * @throws NullPointerException if {@code character} is null
1329 */
1330 public boolean apply(Character character) {
1331 return matches(character);
1332 }
1333
1334 /**
1335 * Returns a string representation of this {@code CharMatcher}, such as
1336 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1337 */
1338
1339 @Override
1340 public String toString() {
1341 return description;
1342 }
1343
1344 /**
1345 * Determines whether a character is whitespace according to the latest Unicode standard, as
1346 * illustrated
1347 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1348 * This is not the same definition used by other Java APIs. (See a
1349 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1350 * definitions of "whitespace"</a>.)
1351 *
1352 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1353 * to date.
1354 */
1355 public static final CharMatcher WHITESPACE = new CharMatcher("CharMatcher.WHITESPACE") {
1356 /**
1357 * A special-case CharMatcher for Unicode whitespace characters that is extremely
1358 * efficient both in space required and in time to check for matches.
1359 *
1360 * Implementation details.
1361 * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1362 * so we can construct a lookup table of exactly 79 entries, and just check the character code
1363 * mod 79, and see if that character is in the table.
1364 *
1365 * There is a 1 at the beginning of the table so that the null character is not listed
1366 * as whitespace.
1367 *
1368 * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1369 *
1370 * * Binary search into the sorted list of characters, i.e., what
1371 * CharMatcher.anyOf() does</li>
1372 * * Perfect hash function into a table of size 26 (using an offset table and a special
1373 * Jenkins hash function)</li>
1374 * * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1375 * * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1376 *
1377 * --Christopher Swenson, February 2012.
1378 */
1379
1380 // Mod-79 lookup table.
1381 private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1382 8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1383 12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1384 8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1385
1386
1387 @Override
1388 public boolean matches(char c) {
1389 return table[c % 79] == c;
1390 }
1391
1392
1393 @Override
1394 public CharMatcher precomputed() {
1395 return this;
1396 }
1397 };
1398 }