001    /*
002     * Copyright (C) 2008 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.base;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    
022    import com.google.common.annotations.Beta;
023    import com.google.common.annotations.GwtCompatible;
024    
025    import java.util.Arrays;
026    import javax.annotation.CheckReturnValue;
027    
028    /**
029     * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
030     * for any {@link Object}. Also offers basic text processing methods based on this function.
031     * Implementations are strongly encouraged to be side-effect-free and immutable.
032     *
033     * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
034     * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
035     *
036     * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
037     * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
038     * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
039     * treats these just as two separate characters.
040     *
041     * <p>Example usages: <pre>
042     *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
043     *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
044     *
045     * <p>See the Guava User Guide article on <a href=
046     * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
047     * {@code CharMatcher}</a>.
048     *
049     * @author Kevin Bourrillion
050     * @since 1.0
051     */
052    @Beta // Possibly change from chars to code points; decide constants vs. methods
053    @GwtCompatible
054    public abstract class CharMatcher implements Predicate<Character> {
055      // Constants
056      /**
057       * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
058       * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
059       * discussion of that term.
060       *
061       * @since 2.0
062       */
063      public static final CharMatcher BREAKING_WHITESPACE =
064          anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
065              .or(inRange('\u2000', '\u2006'))
066              .or(inRange('\u2008', '\u200a'))
067              .withToString("CharMatcher.BREAKING_WHITESPACE")
068              .precomputed();
069    
070      /**
071       * Determines whether a character is ASCII, meaning that its code point is less than 128.
072       */
073      public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
074    
075      /**
076       * Determines whether a character is a digit according to
077       * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
078       */
079      public static final CharMatcher DIGIT;
080    
081      static {
082        CharMatcher digit = inRange('0', '9');
083        String zeroes =
084            "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
085                + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
086                + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
087        for (char base : zeroes.toCharArray()) {
088          digit = digit.or(inRange(base, (char) (base + 9)));
089        }
090        DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed();
091      }
092    
093      /**
094       * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
095       * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
096       */
097      public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
098        @Override
099        public boolean matches(char c) {
100          return Character.isDigit(c);
101        }
102      };
103    
104      /**
105       * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
106       * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
107       * inRange('a', 'z').or(inRange('A', 'Z'))}.
108       */
109      public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
110        @Override
111        public boolean matches(char c) {
112          return Character.isLetter(c);
113        }
114    
115        @Override
116        public CharMatcher precomputed() {
117          return this;
118        }
119      };
120    
121      /**
122       * Determines whether a character is a letter or digit according to {@link
123       * Character#isLetterOrDigit(char) Java's definition}.
124       */
125      public static final CharMatcher JAVA_LETTER_OR_DIGIT =
126          new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
127        @Override
128        public boolean matches(char c) {
129          return Character.isLetterOrDigit(c);
130        }
131      };
132    
133      /**
134       * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
135       * Java's definition}.
136       */
137      public static final CharMatcher JAVA_UPPER_CASE =
138          new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
139        @Override
140        public boolean matches(char c) {
141          return Character.isUpperCase(c);
142        }
143      };
144    
145      /**
146       * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
147       * Java's definition}.
148       */
149      public static final CharMatcher JAVA_LOWER_CASE =
150          new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
151        @Override
152        public boolean matches(char c) {
153          return Character.isLowerCase(c);
154        }
155      };
156    
157      /**
158       * Determines whether a character is an ISO control character as specified by {@link
159       * Character#isISOControl(char)}.
160       */
161      public static final CharMatcher JAVA_ISO_CONTROL =
162          inRange('\u0000', '\u001f')
163          .or(inRange('\u007f', '\u009f'))
164          .withToString("CharMatcher.JAVA_ISO_CONTROL");
165    
166      /**
167       * Determines whether a character is invisible; that is, if its Unicode category is any of
168       * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
169       * PRIVATE_USE according to ICU4J.
170       */
171      public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
172          .or(inRange('\u007f', '\u00a0'))
173          .or(is('\u00ad'))
174          .or(inRange('\u0600', '\u0604'))
175          .or(anyOf("\u06dd\u070f\u1680\u180e"))
176          .or(inRange('\u2000', '\u200f'))
177          .or(inRange('\u2028', '\u202f'))
178          .or(inRange('\u205f', '\u2064'))
179          .or(inRange('\u206a', '\u206f'))
180          .or(is('\u3000'))
181          .or(inRange('\ud800', '\uf8ff'))
182          .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
183          .withToString("CharMatcher.INVISIBLE")
184          .precomputed();
185    
186      /**
187       * Determines whether a character is single-width (not double-width). When in doubt, this matcher
188       * errs on the side of returning {@code false} (that is, it tends to assume a character is
189       * double-width).
190       *
191       * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
192       * date.
193       */
194      public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
195          .or(is('\u05be'))
196          .or(inRange('\u05d0', '\u05ea'))
197          .or(is('\u05f3'))
198          .or(is('\u05f4'))
199          .or(inRange('\u0600', '\u06ff'))
200          .or(inRange('\u0750', '\u077f'))
201          .or(inRange('\u0e00', '\u0e7f'))
202          .or(inRange('\u1e00', '\u20af'))
203          .or(inRange('\u2100', '\u213a'))
204          .or(inRange('\ufb50', '\ufdff'))
205          .or(inRange('\ufe70', '\ufeff'))
206          .or(inRange('\uff61', '\uffdc'))
207          .withToString("CharMatcher.SINGLE_WIDTH")
208          .precomputed();
209    
210      /** Matches any character. */
211      public static final CharMatcher ANY =
212          new CharMatcher("CharMatcher.ANY") {
213            @Override
214            public boolean matches(char c) {
215              return true;
216            }
217    
218            
219            @Override
220            public int indexIn(CharSequence sequence) {
221              return (sequence.length() == 0) ? -1 : 0;
222            }
223    
224            
225            @Override
226            public int indexIn(CharSequence sequence, int start) {
227              int length = sequence.length();
228              Preconditions.checkPositionIndex(start, length);
229              return (start == length) ? -1 : start;
230            }
231    
232            
233            @Override
234            public int lastIndexIn(CharSequence sequence) {
235              return sequence.length() - 1;
236            }
237    
238            
239            @Override
240            public boolean matchesAllOf(CharSequence sequence) {
241              checkNotNull(sequence);
242              return true;
243            }
244    
245            
246            @Override
247            public boolean matchesNoneOf(CharSequence sequence) {
248              return sequence.length() == 0;
249            }
250    
251            
252            @Override
253            public String removeFrom(CharSequence sequence) {
254              checkNotNull(sequence);
255              return "";
256            }
257    
258            
259            @Override
260            public String replaceFrom(CharSequence sequence, char replacement) {
261              char[] array = new char[sequence.length()];
262              Arrays.fill(array, replacement);
263              return new String(array);
264            }
265    
266            
267            @Override
268            public String replaceFrom(CharSequence sequence, CharSequence replacement) {
269              StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
270              for (int i = 0; i < sequence.length(); i++) {
271                retval.append(replacement);
272              }
273              return retval.toString();
274            }
275    
276            
277            @Override
278            public String collapseFrom(CharSequence sequence, char replacement) {
279              return (sequence.length() == 0) ? "" : String.valueOf(replacement);
280            }
281    
282            
283            @Override
284            public String trimFrom(CharSequence sequence) {
285              checkNotNull(sequence);
286              return "";
287            }
288    
289            
290            @Override
291            public int countIn(CharSequence sequence) {
292              return sequence.length();
293            }
294    
295            
296            @Override
297            public CharMatcher and(CharMatcher other) {
298              return checkNotNull(other);
299            }
300    
301            
302            @Override
303            public CharMatcher or(CharMatcher other) {
304              checkNotNull(other);
305              return this;
306            }
307    
308            
309            @Override
310            public CharMatcher negate() {
311              return NONE;
312            }
313    
314            
315            @Override
316            public CharMatcher precomputed() {
317              return this;
318            }
319          };
320    
321      /** Matches no characters. */
322      public static final CharMatcher NONE =
323          new CharMatcher("CharMatcher.NONE") {
324            @Override
325            public boolean matches(char c) {
326              return false;
327            }
328    
329            
330            @Override
331            public int indexIn(CharSequence sequence) {
332              checkNotNull(sequence);
333              return -1;
334            }
335    
336            
337            @Override
338            public int indexIn(CharSequence sequence, int start) {
339              int length = sequence.length();
340              Preconditions.checkPositionIndex(start, length);
341              return -1;
342            }
343    
344            
345            @Override
346            public int lastIndexIn(CharSequence sequence) {
347              checkNotNull(sequence);
348              return -1;
349            }
350    
351            
352            @Override
353            public boolean matchesAllOf(CharSequence sequence) {
354              return sequence.length() == 0;
355            }
356    
357            
358            @Override
359            public boolean matchesNoneOf(CharSequence sequence) {
360              checkNotNull(sequence);
361              return true;
362            }
363    
364            
365            @Override
366            public String removeFrom(CharSequence sequence) {
367              return sequence.toString();
368            }
369    
370            
371            @Override
372            public String replaceFrom(CharSequence sequence, char replacement) {
373              return sequence.toString();
374            }
375    
376            
377            @Override
378            public String replaceFrom(CharSequence sequence, CharSequence replacement) {
379              checkNotNull(replacement);
380              return sequence.toString();
381            }
382    
383            
384            @Override
385            public String collapseFrom(CharSequence sequence, char replacement) {
386              return sequence.toString();
387            }
388    
389            
390            @Override
391            public String trimFrom(CharSequence sequence) {
392              return sequence.toString();
393            }
394    
395            
396            @Override
397            public int countIn(CharSequence sequence) {
398              checkNotNull(sequence);
399              return 0;
400            }
401    
402            
403            @Override
404            public CharMatcher and(CharMatcher other) {
405              checkNotNull(other);
406              return this;
407            }
408    
409            
410            @Override
411            public CharMatcher or(CharMatcher other) {
412              return checkNotNull(other);
413            }
414    
415            
416            @Override
417            public CharMatcher negate() {
418              return ANY;
419            }
420    
421            
422            @Override
423            void setBits(LookupTable table) {}
424    
425            
426            @Override
427            public CharMatcher precomputed() {
428              return this;
429            }
430          };
431    
432      // Static factories
433    
434      /**
435       * Returns a {@code char} matcher that matches only one specified character.
436       */
437      public static CharMatcher is(final char match) {
438        String description = new StringBuilder("CharMatcher.is(")
439            .append(Integer.toHexString(match))
440            .append(")")
441            .toString();
442        return new CharMatcher(description) {
443          @Override
444          public boolean matches(char c) {
445            return c == match;
446          }
447    
448          
449          @Override
450          public String replaceFrom(CharSequence sequence, char replacement) {
451            return sequence.toString().replace(match, replacement);
452          }
453    
454          
455          @Override
456          public CharMatcher and(CharMatcher other) {
457            return other.matches(match) ? this : NONE;
458          }
459    
460          
461          @Override
462          public CharMatcher or(CharMatcher other) {
463            return other.matches(match) ? other : super.or(other);
464          }
465    
466          
467          @Override
468          public CharMatcher negate() {
469            return isNot(match);
470          }
471    
472          
473          @Override
474          void setBits(LookupTable table) {
475            table.set(match);
476          }
477    
478          
479          @Override
480          public CharMatcher precomputed() {
481            return this;
482          }
483        };
484      }
485    
486      /**
487       * Returns a {@code char} matcher that matches any character except the one specified.
488       *
489       * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
490       */
491      public static CharMatcher isNot(final char match) {
492        String description = new StringBuilder("CharMatcher.isNot(")
493            .append(Integer.toHexString(match))
494            .append(")")
495            .toString();
496        return new CharMatcher(description) {
497          @Override
498          public boolean matches(char c) {
499            return c != match;
500          }
501    
502          
503          @Override
504          public CharMatcher and(CharMatcher other) {
505            return other.matches(match) ? super.and(other) : other;
506          }
507    
508          
509          @Override
510          public CharMatcher or(CharMatcher other) {
511            return other.matches(match) ? ANY : this;
512          }
513    
514          
515          @Override
516          public CharMatcher negate() {
517            return is(match);
518          }
519        };
520      }
521    
522      /**
523       * Returns a {@code char} matcher that matches any character present in the given character
524       * sequence.
525       */
526      public static CharMatcher anyOf(final CharSequence sequence) {
527        switch (sequence.length()) {
528          case 0:
529            return NONE;
530          case 1:
531            return is(sequence.charAt(0));
532          case 2:
533            final char match1 = sequence.charAt(0);
534            final char match2 = sequence.charAt(1);
535            return new CharMatcher(
536                new StringBuilder("CharMatcher.anyOf(\"").append(sequence).append("\")").toString()) {
537              @Override
538              public boolean matches(char c) {
539                return c == match1 || c == match2;
540              }
541    
542              
543              @Override
544              void setBits(LookupTable table) {
545                table.set(match1);
546                table.set(match2);
547              }
548    
549              
550              @Override
551              public CharMatcher precomputed() {
552                return this;
553              }
554            };
555        }
556        final char[] chars = sequence.toString().toCharArray();
557        Arrays.sort(chars);
558    
559        return new CharMatcher(new StringBuilder("CharMatcher.anyOf(\"").append(chars)
560            .append("\")").toString()) {
561              @Override
562              public boolean matches(char c) {
563                return Arrays.binarySearch(chars, c) >= 0;
564              }
565        };
566      }
567    
568      /**
569       * Returns a {@code char} matcher that matches any character not present in the given character
570       * sequence.
571       */
572      public static CharMatcher noneOf(CharSequence sequence) {
573        return anyOf(sequence).negate();
574      }
575    
576      /**
577       * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
578       * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
579       * CharMatcher.inRange('a', 'z')}.
580       *
581       * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
582       */
583      public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
584        checkArgument(endInclusive >= startInclusive);
585        String description = new StringBuilder("CharMatcher.inRange(")
586            .append(Integer.toHexString(startInclusive))
587            .append(", ")
588            .append(Integer.toHexString(endInclusive))
589            .append(")")
590            .toString();
591        return inRange(startInclusive, endInclusive, description);
592      }
593    
594      static CharMatcher inRange(final char startInclusive, final char endInclusive,
595          String description) {
596        return new CharMatcher(description) {
597          @Override
598          public boolean matches(char c) {
599            return startInclusive <= c && c <= endInclusive;
600          }
601    
602          
603          @Override
604          void setBits(LookupTable table) {
605            char c = startInclusive;
606            while (true) {
607              table.set(c);
608              if (c++ == endInclusive) {
609                break;
610              }
611            }
612          }
613    
614          
615          @Override
616          public CharMatcher precomputed() {
617            return this;
618          }
619        };
620      }
621    
622      /**
623       * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
624       * which operates on primitive {@code char} instances instead.
625       */
626      public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
627        checkNotNull(predicate);
628        if (predicate instanceof CharMatcher) {
629          return (CharMatcher) predicate;
630        }
631        String description = new StringBuilder("CharMatcher.forPredicate(")
632            .append(predicate)
633            .append(')')
634            .toString();
635        return new CharMatcher(description) {
636          @Override
637          public boolean matches(char c) {
638            return predicate.apply(c);
639          }
640    
641          
642          @Override
643          public boolean apply(Character character) {
644            return predicate.apply(checkNotNull(character));
645          }
646        };
647      }
648    
649      // State
650      final String description;
651    
652      // Constructors
653    
654      /**
655       * Sets the {@code toString()} from the given description.
656       */
657      CharMatcher(String description) {
658        this.description = description;
659      }
660    
661      /**
662       * Constructor for use by subclasses. When subclassing, you may want to override
663       * {@code toString()} to provide a useful description.
664       */
665      protected CharMatcher() {
666        description = "UnknownCharMatcher";
667      }
668    
669      // Abstract methods
670    
671      /** Determines a true or false value for the given character. */
672      public abstract boolean matches(char c);
673    
674      // Non-static factories
675    
676      /**
677       * Returns a matcher that matches any character not matched by this matcher.
678       */
679      public CharMatcher negate() {
680        final CharMatcher original = this;
681        return new CharMatcher(original + ".negate()") {
682          @Override
683          public boolean matches(char c) {
684            return !original.matches(c);
685          }
686    
687          
688          @Override
689          public boolean matchesAllOf(CharSequence sequence) {
690            return original.matchesNoneOf(sequence);
691          }
692    
693          
694          @Override
695          public boolean matchesNoneOf(CharSequence sequence) {
696            return original.matchesAllOf(sequence);
697          }
698    
699          
700          @Override
701          public int countIn(CharSequence sequence) {
702            return sequence.length() - original.countIn(sequence);
703          }
704    
705          
706          @Override
707          public CharMatcher negate() {
708            return original;
709          }
710        };
711      }
712    
713      /**
714       * Returns a matcher that matches any character matched by both this matcher and {@code other}.
715       */
716      public CharMatcher and(CharMatcher other) {
717        return new And(this, checkNotNull(other));
718      }
719    
720      private static class And extends CharMatcher {
721        final CharMatcher first;
722        final CharMatcher second;
723    
724        And(CharMatcher a, CharMatcher b) {
725          this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
726        }
727    
728        And(CharMatcher a, CharMatcher b, String description) {
729          super(description);
730          first = checkNotNull(a);
731          second = checkNotNull(b);
732        }
733    
734        
735        @Override
736        public CharMatcher and(CharMatcher other) {
737          return new And(this, other);
738        }
739    
740        
741        @Override
742        public boolean matches(char c) {
743          return first.matches(c) && second.matches(c);
744        }
745    
746        
747        @Override
748        CharMatcher withToString(String description) {
749          return new And(first, second, description);
750        }
751      }
752    
753      /**
754       * Returns a matcher that matches any character matched by either this matcher or {@code other}.
755       */
756      public CharMatcher or(CharMatcher other) {
757        return new Or(this, checkNotNull(other));
758      }
759    
760      private static class Or extends CharMatcher {
761        final CharMatcher first;
762        final CharMatcher second;
763    
764        Or(CharMatcher a, CharMatcher b, String description) {
765          super(description);
766          first = checkNotNull(a);
767          second = checkNotNull(b);
768        }
769    
770        Or(CharMatcher a, CharMatcher b) {
771          this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
772        }
773    
774        
775        @Override
776        public CharMatcher or(CharMatcher other) {
777          return new Or(this, checkNotNull(other));
778        }
779    
780        
781        @Override
782        public boolean matches(char c) {
783          return first.matches(c) || second.matches(c);
784        }
785    
786        
787        @Override
788        CharMatcher withToString(String description) {
789          return new Or(first, second, description);
790        }
791      }
792    
793      /**
794       * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
795       * query than the original; your mileage may vary. Precomputation takes time and is likely to be
796       * worthwhile only if the precomputed matcher is queried many thousands of times.
797       *
798       * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
799       * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
800       * worthwhile tradeoff in a browser.
801       */
802      public CharMatcher precomputed() {
803        return Platform.precomputeCharMatcher(this);
804      }
805    
806      /**
807       * Construct an array of all possible chars in the slowest way possible.
808       */
809      char[] slowGetChars() {
810        char[] allChars = new char[65536];
811        int size = 0;
812        for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
813          if (matches((char) c)) {
814            allChars[size++] = (char) c;
815          }
816        }
817        char[] retValue = new char[size];
818        System.arraycopy(allChars, 0, retValue, 0, size);
819        return retValue;
820      }
821    
822      /**
823       * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
824       * on {@link Platform} so that we can have different behavior in GWT.
825       *
826       * <p>If the number of matched characters is small enough, we try to build a small hash
827       * table to contain all of the characters. Otherwise, we record the characters in eight-kilobyte
828       * bit array. In many situations this produces a matcher which is faster to query
829       * than the original.
830       */
831      CharMatcher precomputedInternal() {
832        final char[] chars = slowGetChars();
833        int totalCharacters = chars.length;
834        if (totalCharacters == 0) {
835          return NONE;
836        } else if (totalCharacters == 1) {
837          return is(chars[0]);
838        } else if (totalCharacters < SmallCharMatcher.MAX_SIZE) {
839          return SmallCharMatcher.from(chars, toString());
840        } else if (totalCharacters < MediumCharMatcher.MAX_SIZE) {
841          return MediumCharMatcher.from(chars, toString());
842        }
843        // Otherwise, make the full lookup table.
844        final LookupTable table = new LookupTable();
845        setBits(table);
846        final CharMatcher outer = this;
847    
848        return new CharMatcher(outer.toString()) {
849          @Override
850          public boolean matches(char c) {
851            return table.get(c);
852          }
853    
854          // TODO(kevinb): make methods like negate() smart?
855    
856          
857          @Override
858          public CharMatcher precomputed() {
859            return this;
860          }
861        };
862      }
863    
864      /**
865       * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
866       * but with their {@code toString} method overridden with the new description.
867       *
868       * <p>This is unsupported by default.
869       */
870      CharMatcher withToString(String description) {
871        throw new UnsupportedOperationException();
872    
873      }
874    
875      /**
876       * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
877       * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
878       *
879       * <p>The default implementation loops over every possible character value, invoking {@link
880       * #matches} for each one.
881       */
882      void setBits(LookupTable table) {
883        char c = Character.MIN_VALUE;
884        while (true) {
885          if (matches(c)) {
886            table.set(c);
887          }
888          if (c++ == Character.MAX_VALUE) {
889            break;
890          }
891        }
892      }
893    
894      /**
895       * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
896       *
897       * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
898       * simpler java.util.BitSet.
899       */
900      private static final class LookupTable {
901        int[] data = new int[2048];
902    
903        void set(char index) {
904          data[index >> 5] |= (1 << index);
905        }
906    
907        boolean get(char index) {
908          return (data[index >> 5] & (1 << index)) != 0;
909        }
910      }
911    
912      // Text processing routines
913    
914      /**
915       * Returns {@code true} if a character sequence contains at least one matching character.
916       * Equivalent to {@code !matchesNoneOf(sequence)}.
917       *
918       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
919       * character, until this returns {@code true} or the end is reached.
920       *
921       * @param sequence the character sequence to examine, possibly empty
922       * @return {@code true} if this matcher matches at least one character in the sequence
923       * @since 8.0
924       */
925      public boolean matchesAnyOf(CharSequence sequence) {
926        return !matchesNoneOf(sequence);
927      }
928    
929      /**
930       * Returns {@code true} if a character sequence contains only matching characters.
931       *
932       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
933       * character, until this returns {@code false} or the end is reached.
934       *
935       * @param sequence the character sequence to examine, possibly empty
936       * @return {@code true} if this matcher matches every character in the sequence, including when
937       *         the sequence is empty
938       */
939      public boolean matchesAllOf(CharSequence sequence) {
940        for (int i = sequence.length() - 1; i >= 0; i--) {
941          if (!matches(sequence.charAt(i))) {
942            return false;
943          }
944        }
945        return true;
946      }
947    
948      /**
949       * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
950       * {@code !matchesAnyOf(sequence)}.
951       *
952       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
953       * character, until this returns {@code false} or the end is reached.
954       *
955       * @param sequence the character sequence to examine, possibly empty
956       * @return {@code true} if this matcher matches every character in the sequence, including when
957       *         the sequence is empty
958       */
959      public boolean matchesNoneOf(CharSequence sequence) {
960        return indexIn(sequence) == -1;
961      }
962    
963      /**
964       * Returns the index of the first matching character in a character sequence, or {@code -1} if no
965       * matching character is present.
966       *
967       * <p>The default implementation iterates over the sequence in forward order calling {@link
968       * #matches} for each character.
969       *
970       * @param sequence the character sequence to examine from the beginning
971       * @return an index, or {@code -1} if no character matches
972       */
973      public int indexIn(CharSequence sequence) {
974        int length = sequence.length();
975        for (int i = 0; i < length; i++) {
976          if (matches(sequence.charAt(i))) {
977            return i;
978          }
979        }
980        return -1;
981      }
982    
983      /**
984       * Returns the index of the first matching character in a character sequence, starting from a
985       * given position, or {@code -1} if no character matches after that position.
986       *
987       * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
988       * start}, calling {@link #matches} for each character.
989       *
990       * @param sequence the character sequence to examine
991       * @param start the first index to examine; must be nonnegative and no greater than {@code
992       *        sequence.length()}
993       * @return the index of the first matching character, guaranteed to be no less than {@code start},
994       *         or {@code -1} if no character matches
995       * @throws IndexOutOfBoundsException if start is negative or greater than {@code
996       *         sequence.length()}
997       */
998      public int indexIn(CharSequence sequence, int start) {
999        int length = sequence.length();
1000        Preconditions.checkPositionIndex(start, length);
1001        for (int i = start; i < length; i++) {
1002          if (matches(sequence.charAt(i))) {
1003            return i;
1004          }
1005        }
1006        return -1;
1007      }
1008    
1009      /**
1010       * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1011       * matching character is present.
1012       *
1013       * <p>The default implementation iterates over the sequence in reverse order calling {@link
1014       * #matches} for each character.
1015       *
1016       * @param sequence the character sequence to examine from the end
1017       * @return an index, or {@code -1} if no character matches
1018       */
1019      public int lastIndexIn(CharSequence sequence) {
1020        for (int i = sequence.length() - 1; i >= 0; i--) {
1021          if (matches(sequence.charAt(i))) {
1022            return i;
1023          }
1024        }
1025        return -1;
1026      }
1027    
1028      /**
1029       * Returns the number of matching characters found in a character sequence.
1030       */
1031      public int countIn(CharSequence sequence) {
1032        int count = 0;
1033        for (int i = 0; i < sequence.length(); i++) {
1034          if (matches(sequence.charAt(i))) {
1035            count++;
1036          }
1037        }
1038        return count;
1039      }
1040    
1041      /**
1042       * Returns a string containing all non-matching characters of a character sequence, in order. For
1043       * example: <pre>   {@code
1044       *
1045       *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
1046       *
1047       * ... returns {@code "bzr"}.
1048       */
1049      @CheckReturnValue
1050      public String removeFrom(CharSequence sequence) {
1051        String string = sequence.toString();
1052        int pos = indexIn(string);
1053        if (pos == -1) {
1054          return string;
1055        }
1056    
1057        char[] chars = string.toCharArray();
1058        int spread = 1;
1059    
1060        // This unusual loop comes from extensive benchmarking
1061        OUT: while (true) {
1062          pos++;
1063          while (true) {
1064            if (pos == chars.length) {
1065              break OUT;
1066            }
1067            if (matches(chars[pos])) {
1068              break;
1069            }
1070            chars[pos - spread] = chars[pos];
1071            pos++;
1072          }
1073          spread++;
1074        }
1075        return new String(chars, 0, pos - spread);
1076      }
1077    
1078      /**
1079       * Returns a string containing all matching characters of a character sequence, in order. For
1080       * example: <pre>   {@code
1081       *
1082       *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1083       *
1084       * ... returns {@code "aaa"}.
1085       */
1086      @CheckReturnValue
1087      public String retainFrom(CharSequence sequence) {
1088        return negate().removeFrom(sequence);
1089      }
1090    
1091      /**
1092       * Returns a string copy of the input character sequence, with each character that matches this
1093       * matcher replaced by a given replacement character. For example: <pre>   {@code
1094       *
1095       *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1096       *
1097       * ... returns {@code "rodor"}.
1098       *
1099       * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1100       * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1101       * character.
1102       *
1103       * @param sequence the character sequence to replace matching characters in
1104       * @param replacement the character to append to the result string in place of each matching
1105       *        character in {@code sequence}
1106       * @return the new string
1107       */
1108      @CheckReturnValue
1109      public String replaceFrom(CharSequence sequence, char replacement) {
1110        String string = sequence.toString();
1111        int pos = indexIn(string);
1112        if (pos == -1) {
1113          return string;
1114        }
1115        char[] chars = string.toCharArray();
1116        chars[pos] = replacement;
1117        for (int i = pos + 1; i < chars.length; i++) {
1118          if (matches(chars[i])) {
1119            chars[i] = replacement;
1120          }
1121        }
1122        return new String(chars);
1123      }
1124    
1125      /**
1126       * Returns a string copy of the input character sequence, with each character that matches this
1127       * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1128       *
1129       *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1130       *
1131       * ... returns {@code "yoohoo"}.
1132       *
1133       * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1134       * off calling {@link #replaceFrom(CharSequence, char)} directly.
1135       *
1136       * @param sequence the character sequence to replace matching characters in
1137       * @param replacement the characters to append to the result string in place of each matching
1138       *        character in {@code sequence}
1139       * @return the new string
1140       */
1141      @CheckReturnValue
1142      public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1143        int replacementLen = replacement.length();
1144        if (replacementLen == 0) {
1145          return removeFrom(sequence);
1146        }
1147        if (replacementLen == 1) {
1148          return replaceFrom(sequence, replacement.charAt(0));
1149        }
1150    
1151        String string = sequence.toString();
1152        int pos = indexIn(string);
1153        if (pos == -1) {
1154          return string;
1155        }
1156    
1157        int len = string.length();
1158        StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1159    
1160        int oldpos = 0;
1161        do {
1162          buf.append(string, oldpos, pos);
1163          buf.append(replacement);
1164          oldpos = pos + 1;
1165          pos = indexIn(string, oldpos);
1166        } while (pos != -1);
1167    
1168        buf.append(string, oldpos, len);
1169        return buf.toString();
1170      }
1171    
1172      /**
1173       * Returns a substring of the input character sequence that omits all characters this matcher
1174       * matches from the beginning and from the end of the string. For example: <pre>   {@code
1175       *
1176       *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1177       *
1178       * ... returns {@code "cat"}.
1179       *
1180       * <p>Note that: <pre>   {@code
1181       *
1182       *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1183       *
1184       * ... is equivalent to {@link String#trim()}.
1185       */
1186      @CheckReturnValue
1187      public String trimFrom(CharSequence sequence) {
1188        int len = sequence.length();
1189        int first;
1190        int last;
1191    
1192        for (first = 0; first < len; first++) {
1193          if (!matches(sequence.charAt(first))) {
1194            break;
1195          }
1196        }
1197        for (last = len - 1; last > first; last--) {
1198          if (!matches(sequence.charAt(last))) {
1199            break;
1200          }
1201        }
1202    
1203        return sequence.subSequence(first, last + 1).toString();
1204      }
1205    
1206      /**
1207       * Returns a substring of the input character sequence that omits all characters this matcher
1208       * matches from the beginning of the string. For example: <pre> {@code
1209       *
1210       *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1211       *
1212       * ... returns {@code "catbab"}.
1213       */
1214      @CheckReturnValue
1215      public String trimLeadingFrom(CharSequence sequence) {
1216        int len = sequence.length();
1217        int first;
1218    
1219        for (first = 0; first < len; first++) {
1220          if (!matches(sequence.charAt(first))) {
1221            break;
1222          }
1223        }
1224    
1225        return sequence.subSequence(first, len).toString();
1226      }
1227    
1228      /**
1229       * Returns a substring of the input character sequence that omits all characters this matcher
1230       * matches from the end of the string. For example: <pre> {@code
1231       *
1232       *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1233       *
1234       * ... returns {@code "abacat"}.
1235       */
1236      @CheckReturnValue
1237      public String trimTrailingFrom(CharSequence sequence) {
1238        int len = sequence.length();
1239        int last;
1240    
1241        for (last = len - 1; last >= 0; last--) {
1242          if (!matches(sequence.charAt(last))) {
1243            break;
1244          }
1245        }
1246    
1247        return sequence.subSequence(0, last + 1).toString();
1248      }
1249    
1250      /**
1251       * Returns a string copy of the input character sequence, with each group of consecutive
1252       * characters that match this matcher replaced by a single replacement character. For example:
1253       * <pre>   {@code
1254       *
1255       *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1256       *
1257       * ... returns {@code "b-p-r"}.
1258       *
1259       * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1260       * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1261       * character.
1262       *
1263       * @param sequence the character sequence to replace matching groups of characters in
1264       * @param replacement the character to append to the result string in place of each group of
1265       *        matching characters in {@code sequence}
1266       * @return the new string
1267       */
1268      @CheckReturnValue
1269      public String collapseFrom(CharSequence sequence, char replacement) {
1270        int first = indexIn(sequence);
1271        if (first == -1) {
1272          return sequence.toString();
1273        }
1274    
1275        // TODO(kevinb): see if this implementation can be made faster
1276        StringBuilder builder = new StringBuilder(sequence.length())
1277            .append(sequence.subSequence(0, first))
1278            .append(replacement);
1279        boolean in = true;
1280        for (int i = first + 1; i < sequence.length(); i++) {
1281          char c = sequence.charAt(i);
1282          if (matches(c)) {
1283            if (!in) {
1284              builder.append(replacement);
1285              in = true;
1286            }
1287          } else {
1288            builder.append(c);
1289            in = false;
1290          }
1291        }
1292        return builder.toString();
1293      }
1294    
1295      /**
1296       * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1297       * groups of matching characters at the start or end of the sequence are removed without
1298       * replacement.
1299       */
1300      @CheckReturnValue
1301      public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1302        int first = negate().indexIn(sequence);
1303        if (first == -1) {
1304          return ""; // everything matches. nothing's left.
1305        }
1306        StringBuilder builder = new StringBuilder(sequence.length());
1307        boolean inMatchingGroup = false;
1308        for (int i = first; i < sequence.length(); i++) {
1309          char c = sequence.charAt(i);
1310          if (matches(c)) {
1311            inMatchingGroup = true;
1312          } else {
1313            if (inMatchingGroup) {
1314              builder.append(replacement);
1315              inMatchingGroup = false;
1316            }
1317            builder.append(c);
1318          }
1319        }
1320        return builder.toString();
1321      }
1322    
1323      // Predicate interface
1324    
1325      /**
1326       * Returns {@code true} if this matcher matches the given character.
1327       *
1328       * @throws NullPointerException if {@code character} is null
1329       */
1330      public boolean apply(Character character) {
1331        return matches(character);
1332      }
1333    
1334      /**
1335       * Returns a string representation of this {@code CharMatcher}, such as
1336       * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1337       */
1338      
1339      @Override
1340      public String toString() {
1341        return description;
1342      }
1343    
1344      /**
1345       * Determines whether a character is whitespace according to the latest Unicode standard, as
1346       * illustrated
1347       * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1348       * This is not the same definition used by other Java APIs. (See a
1349       * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1350       * definitions of "whitespace"</a>.)
1351       *
1352       * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1353       * to date.
1354       */
1355      public static final CharMatcher WHITESPACE = new CharMatcher("CharMatcher.WHITESPACE") {
1356        /**
1357         * A special-case CharMatcher for Unicode whitespace characters that is extremely
1358         * efficient both in space required and in time to check for matches.
1359         *
1360         * Implementation details.
1361         * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1362         * so we can construct a lookup table of exactly 79 entries, and just check the character code
1363         * mod 79, and see if that character is in the table.
1364         *
1365         * There is a 1 at the beginning of the table so that the null character is not listed
1366         * as whitespace.
1367         *
1368         * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1369         *
1370         *   * Binary search into the sorted list of characters, i.e., what
1371         *     CharMatcher.anyOf() does</li>
1372         *   * Perfect hash function into a table of size 26 (using an offset table and a special
1373         *     Jenkins hash function)</li>
1374         *   * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1375         *   * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1376         *
1377         * --Christopher Swenson, February 2012.
1378         */
1379    
1380        // Mod-79 lookup table.
1381        private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1382            8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1383            12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1384            8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1385    
1386        
1387        @Override
1388        public boolean matches(char c) {
1389          return table[c % 79] == c;
1390        }
1391    
1392        
1393        @Override
1394        public CharMatcher precomputed() {
1395          return this;
1396        }
1397      };
1398    }