1   /*
2    * %W% %E%
3    *
4    * Copyright (c) 2006, Oracle and/or its affiliates. All rights reserved.
5    * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   
8   package java.util.regex;
9   
10  
11  /**
12   * An engine that performs match operations on a {@link java.lang.CharSequence
13   * </code>character sequence<code>} by interpreting a {@link Pattern}.
14   *
15   * <p> A matcher is created from a pattern by invoking the pattern's {@link
16   * Pattern#matcher matcher} method.  Once created, a matcher can be used to
17   * perform three different kinds of match operations:
18   *
19   * <ul>
20   *
21   *   <li><p> The {@link #matches matches} method attempts to match the entire
22   *   input sequence against the pattern.  </p></li>
23   *
24   *   <li><p> The {@link #lookingAt lookingAt} method attempts to match the
25   *   input sequence, starting at the beginning, against the pattern.  </p></li>
26   *
27   *   <li><p> The {@link #find find} method scans the input sequence looking for
28   *   the next subsequence that matches the pattern.  </p></li>
29   *
30   * </ul>
31   *
32   * <p> Each of these methods returns a boolean indicating success or failure.
33   * More information about a successful match can be obtained by querying the
34   * state of the matcher.
35   *
36   * <p> A matcher finds matches in a subset of its input called the 
37   * <i>region</i>. By default, the region contains all of the matcher's input. 
38   * The region can be modified via the{@link #region region} method and queried
39   * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd} 
40   * methods. The way that the region boundaries interact with some pattern
41   * constructs can be changed. See {@link #useAnchoringBounds 
42   * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
43   * for more details.
44   *
45   * <p> This class also defines methods for replacing matched subsequences with
46   * new strings whose contents can, if desired, be computed from the match
47   * result.  The {@link #appendReplacement appendReplacement} and {@link
48   * #appendTail appendTail} methods can be used in tandem in order to collect
49   * the result into an existing string buffer, or the more convenient {@link
50   * #replaceAll replaceAll} method can be used to create a string in which every
51   * matching subsequence in the input sequence is replaced.
52   *
53   * <p> The explicit state of a matcher includes the start and end indices of
54   * the most recent successful match.  It also includes the start and end
55   * indices of the input subsequence captured by each <a
56   * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
57   * count of such subsequences.  As a convenience, methods are also provided for
58   * returning these captured subsequences in string form.
59   *
60   * <p> The explicit state of a matcher is initially undefined; attempting to
61   * query any part of it before a successful match will cause an {@link
62   * IllegalStateException} to be thrown.  The explicit state of a matcher is
63   * recomputed by every match operation.
64   *
65   * <p> The implicit state of a matcher includes the input character sequence as
66   * well as the <i>append position</i>, which is initially zero and is updated
67   * by the {@link #appendReplacement appendReplacement} method.
68   *
69   * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
70   * method or, if a new input sequence is desired, its {@link
71   * #reset(java.lang.CharSequence) reset(CharSequence)} method.  Resetting a
72   * matcher discards its explicit state information and sets the append position
73   * to zero.
74   *
75   * <p> Instances of this class are not safe for use by multiple concurrent
76   * threads. </p>
77   *
78   *
79   * @author      Mike McCloskey
80   * @author  Mark Reinhold
81   * @author  JSR-51 Expert Group
82   * @version     %I%, %E%
83   * @since   1.4
84   * @spec        JSR-51
85   */
86  
87  public final class Matcher implements MatchResult {
88  
89      /**
90       * The Pattern object that created this Matcher.
91       */
92      Pattern parentPattern;
93  
94      /**
95       * The storage used by groups. They may contain invalid values if
96       * a group was skipped during the matching.
97       */
98      int[] groups;
99  
100     /**
101      * The range within the sequence that is to be matched. Anchors
102      * will match at these "hard" boundaries. Changing the region
103      * changes these values.
104      */
105     int from, to;
106 
107     /**
108      * Lookbehind uses this value to ensure that the subexpression
109      * match ends at the point where the lookbehind was encountered.
110      */
111     int lookbehindTo;
112 
113     /**
114      * The original string being matched.
115      */
116     CharSequence text;
117 
118     /**
119      * Matcher state used by the last node. NOANCHOR is used when a
120      * match does not have to consume all of the input. ENDANCHOR is
121      * the mode used for matching all the input.
122      */
123     static final int ENDANCHOR = 1;
124     static final int NOANCHOR = 0;
125     int acceptMode = NOANCHOR;
126 
127     /**
128      * The range of string that last matched the pattern. If the last
129      * match failed then first is -1; last initially holds 0 then it
130      * holds the index of the end of the last match (which is where the
131      * next search starts).
132      */
133     int first = -1, last = 0;
134 
135     /**
136      * The end index of what matched in the last match operation.
137      */
138     int oldLast = -1;
139 
140     /**
141      * The index of the last position appended in a substitution.
142      */
143     int lastAppendPosition = 0;
144 
145     /**
146      * Storage used by nodes to tell what repetition they are on in
147      * a pattern, and where groups begin. The nodes themselves are stateless,
148      * so they rely on this field to hold state during a match.
149      */
150     int[] locals;
151 
152     /**
153      * Boolean indicating whether or not more input could change
154      * the results of the last match. 
155      * 
156      * If hitEnd is true, and a match was found, then more input
157      * might cause a different match to be found.
158      * If hitEnd is true and a match was not found, then more
159      * input could cause a match to be found.
160      * If hitEnd is false and a match was found, then more input
161      * will not change the match.
162      * If hitEnd is false and a match was not found, then more
163      * input will not cause a match to be found.
164      */
165     boolean hitEnd;
166 
167     /**
168      * Boolean indicating whether or not more input could change
169      * a positive match into a negative one.
170      *
171      * If requireEnd is true, and a match was found, then more
172      * input could cause the match to be lost.
173      * If requireEnd is false and a match was found, then more
174      * input might change the match but the match won't be lost.
175      * If a match was not found, then requireEnd has no meaning.
176      */
177     boolean requireEnd;
178 
179     /**
180      * If transparentBounds is true then the boundaries of this
181      * matcher's region are transparent to lookahead, lookbehind,
182      * and boundary matching constructs that try to see beyond them.
183      */
184     boolean transparentBounds = false;
185 
186     /**
187      * If anchoringBounds is true then the boundaries of this 
188      * matcher's region match anchors such as ^ and $.
189      */
190     boolean anchoringBounds = true;
191 
192     /**
193      * No default constructor.
194      */
195     Matcher() {
196     }
197 
198     /**
199      * All matchers have the state used by Pattern during a match.
200      */
201     Matcher(Pattern parent, CharSequence text) {
202         this.parentPattern = parent;
203         this.text = text;
204 
205         // Allocate state storage
206         int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
207         groups = new int[parentGroupCount * 2];
208         locals = new int[parent.localCount];
209 
210         // Put fields into initial states
211         reset();
212     }
213 
214     /**
215      * Returns the pattern that is interpreted by this matcher.
216      *
217      * @return  The pattern for which this matcher was created
218      */
219     public Pattern pattern() {
220         return parentPattern;
221     }
222 
223     /**
224      * Returns the match state of this matcher as a {@link MatchResult}.
225      * The result is unaffected by subsequent operations performed upon this
226      * matcher.
227      *
228      * @return  a <code>MatchResult</code> with the state of this matcher
229      * @since 1.5
230      */
231     public MatchResult toMatchResult() {
232         Matcher result = new Matcher(this.parentPattern, text.toString());
233         result.first = this.first;
234         result.last = this.last;
235         result.groups = (int[])(this.groups.clone());
236         return result;
237     }
238 
239     /**
240       * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
241       * find matches with.
242       *
243       * <p> This method causes this matcher to lose information
244       * about the groups of the last match that occurred. The
245       * matcher's position in the input is maintained and its
246       * last append position is unaffected.</p>
247       *
248       * @param  newPattern
249       *         The new pattern used by this matcher
250       * @return  This matcher
251       * @throws  IllegalArgumentException
252       *          If newPattern is <tt>null</tt>
253       * @since 1.5
254       */
255     public Matcher usePattern(Pattern newPattern) {
256         if (newPattern == null)
257             throw new IllegalArgumentException("Pattern cannot be null");
258         parentPattern = newPattern;
259      
260         // Reallocate state storage
261         int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
262         groups = new int[parentGroupCount * 2];
263         locals = new int[newPattern.localCount];
264         for (int i = 0; i < groups.length; i++)
265             groups[i] = -1;
266         for (int i = 0; i < locals.length; i++)
267             locals[i] = -1;
268         return this;
269     }
270 
271     /**
272      * Resets this matcher.
273      *
274      * <p> Resetting a matcher discards all of its explicit state information
275      * and sets its append position to zero. The matcher's region is set to the
276      * default region, which is its entire character sequence. The anchoring
277      * and transparency of this matcher's region boundaries are unaffected.
278      *
279      * @return  This matcher
280      */
281     public Matcher reset() {
282         first = -1;
283         last = 0;
284         oldLast = -1;
285         for(int i=0; i<groups.length; i++)
286             groups[i] = -1;
287         for(int i=0; i<locals.length; i++)
288             locals[i] = -1;
289         lastAppendPosition = 0;
290         from = 0;
291         to = getTextLength();
292     return this;
293     }
294 
295     /**
296      * Resets this matcher with a new input sequence.
297      *
298      * <p> Resetting a matcher discards all of its explicit state information
299      * and sets its append position to zero.  The matcher's region is set to
300      * the default region, which is its entire character sequence.  The 
301      * anchoring and transparency of this matcher's region boundaries are 
302      * unaffected.
303      *
304      * @param  input
305      *         The new input character sequence
306      *
307      * @return  This matcher
308      */
309     public Matcher reset(CharSequence input) {
310         text = input;
311         return reset();
312     }
313 
314     /**
315      * Returns the start index of the previous match.  </p>
316      *
317      * @return  The index of the first character matched
318      *
319      * @throws  IllegalStateException
320      *          If no match has yet been attempted,
321      *          or if the previous match operation failed
322      */
323     public int start() {
324         if (first < 0)
325             throw new IllegalStateException("No match available");
326         return first;
327     }
328 
329     /**
330      * Returns the start index of the subsequence captured by the given group
331      * during the previous match operation.
332      *
333      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
334      * to right, starting at one.  Group zero denotes the entire pattern, so
335      * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
336      * <i>m.</i><tt>start()</tt>.  </p>
337      *
338      * @param  group
339      *         The index of a capturing group in this matcher's pattern
340      *
341      * @return  The index of the first character captured by the group,
342      *          or <tt>-1</tt> if the match was successful but the group
343      *          itself did not match anything
344      *
345      * @throws  IllegalStateException
346      *          If no match has yet been attempted,
347      *          or if the previous match operation failed
348      *
349      * @throws  IndexOutOfBoundsException
350      *          If there is no capturing group in the pattern
351      *          with the given index
352      */
353     public int start(int group) {
354         if (first < 0)
355             throw new IllegalStateException("No match available");
356         if (group > groupCount())
357             throw new IndexOutOfBoundsException("No group " + group);
358         return groups[group * 2];
359     }
360 
361     /**
362      * Returns the offset after the last character matched.  </p>
363      *
364      * @return  The offset after the last character matched
365      *
366      * @throws  IllegalStateException
367      *          If no match has yet been attempted,
368      *          or if the previous match operation failed
369      */
370     public int end() {
371         if (first < 0)
372             throw new IllegalStateException("No match available");
373         return last;
374     }
375 
376     /**
377      * Returns the offset after the last character of the subsequence
378      * captured by the given group during the previous match operation.
379      *
380      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
381      * to right, starting at one.  Group zero denotes the entire pattern, so
382      * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
383      * <i>m.</i><tt>end()</tt>.  </p>
384      *
385      * @param  group
386      *         The index of a capturing group in this matcher's pattern
387      *
388      * @return  The offset after the last character captured by the group,
389      *          or <tt>-1</tt> if the match was successful
390      *          but the group itself did not match anything
391      *
392      * @throws  IllegalStateException
393      *          If no match has yet been attempted,
394      *          or if the previous match operation failed
395      *
396      * @throws  IndexOutOfBoundsException
397      *          If there is no capturing group in the pattern
398      *          with the given index
399      */
400     public int end(int group) {
401         if (first < 0)
402             throw new IllegalStateException("No match available");
403         if (group > groupCount())
404             throw new IndexOutOfBoundsException("No group " + group);
405         return groups[group * 2 + 1];
406     }
407 
408     /**
409      * Returns the input subsequence matched by the previous match.
410      *
411      * <p> For a matcher <i>m</i> with input sequence <i>s</i>, 
412      * the expressions <i>m.</i><tt>group()</tt> and
413      * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt>&nbsp;<i>m.</i><tt>end())</tt>
414      * are equivalent.  </p>
415      *
416      * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
417      * string.  This method will return the empty string when the pattern
418      * successfully matches the empty string in the input.  </p>
419      *
420      * @return The (possibly empty) subsequence matched by the previous match,
421      *         in string form
422      *
423      * @throws  IllegalStateException
424      *          If no match has yet been attempted,
425      *          or if the previous match operation failed
426      */
427     public String group() {
428         return group(0);
429     }
430 
431     /**
432      * Returns the input subsequence captured by the given group during the
433      * previous match operation.
434      *
435      * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
436      * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
437      * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt>&nbsp;<i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
438      * are equivalent.  </p>
439      * 
440      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
441      * to right, starting at one.  Group zero denotes the entire pattern, so
442      * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
443      * </p>
444      *
445      * <p> If the match was successful but the group specified failed to match
446      * any part of the input sequence, then <tt>null</tt> is returned. Note
447      * that some groups, for example <tt>(a*)</tt>, match the empty string.
448      * This method will return the empty string when such a group successfully
449      * matches the empty string in the input.  </p>
450      *
451      * @param  group
452      *         The index of a capturing group in this matcher's pattern
453      *
454      * @return  The (possibly empty) subsequence captured by the group
455      *          during the previous match, or <tt>null</tt> if the group
456      *          failed to match part of the input
457      *
458      * @throws  IllegalStateException
459      *          If no match has yet been attempted,
460      *          or if the previous match operation failed
461      *
462      * @throws  IndexOutOfBoundsException
463      *          If there is no capturing group in the pattern
464      *          with the given index
465      */
466     public String group(int group) {
467         if (first < 0)
468             throw new IllegalStateException("No match found");
469         if (group < 0 || group > groupCount())
470             throw new IndexOutOfBoundsException("No group " + group);
471         if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
472             return null;
473         return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
474     }
475 
476     /**
477      * Returns the number of capturing groups in this matcher's pattern.
478      *
479      * <p> Group zero denotes the entire pattern by convention. It is not
480      * included in this count.
481      *
482      * <p> Any non-negative integer smaller than or equal to the value
483      * returned by this method is guaranteed to be a valid group index for
484      * this matcher.  </p>
485      *
486      * @return The number of capturing groups in this matcher's pattern
487      */
488     public int groupCount() {
489         return parentPattern.capturingGroupCount - 1;
490     }
491 
492     /**
493      * Attempts to match the entire region against the pattern.
494      *
495      * <p> If the match succeeds then more information can be obtained via the
496      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
497      *
498      * @return  <tt>true</tt> if, and only if, the entire region sequence
499      *          matches this matcher's pattern
500      */
501     public boolean matches() {
502         return match(from, ENDANCHOR);
503     }
504 
505     /**
506      * Attempts to find the next subsequence of the input sequence that matches
507      * the pattern.
508      *
509      * <p> This method starts at the beginning of this matcher's region, or, if
510      * a previous invocation of the method was successful and the matcher has 
511      * not since been reset, at the first character not matched by the previous
512      * match.
513      *
514      * <p> If the match succeeds then more information can be obtained via the
515      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
516      *
517      * @return  <tt>true</tt> if, and only if, a subsequence of the input
518      *          sequence matches this matcher's pattern
519      */
520     public boolean find() {
521         int nextSearchIndex = last;
522         if (nextSearchIndex == first)
523             nextSearchIndex++;
524 
525         // If next search starts before region, start it at region
526         if (nextSearchIndex < from)
527             nextSearchIndex = from;
528 
529         // If next search starts beyond region then it fails
530         if (nextSearchIndex > to) {
531             for (int i = 0; i < groups.length; i++)
532                 groups[i] = -1;
533             return false;
534         }
535         return search(nextSearchIndex);
536     }
537 
538     /**
539      * Resets this matcher and then attempts to find the next subsequence of
540      * the input sequence that matches the pattern, starting at the specified
541      * index.
542      *
543      * <p> If the match succeeds then more information can be obtained via the
544      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
545      * invocations of the {@link #find()} method will start at the first
546      * character not matched by this match.  </p>
547      *
548      * @throws  IndexOutOfBoundsException
549      *          If start is less than zero or if start is greater than the
550      *          length of the input sequence.
551      *
552      * @return  <tt>true</tt> if, and only if, a subsequence of the input
553      *          sequence starting at the given index matches this matcher's
554      *          pattern
555      */
556     public boolean find(int start) {
557         int limit = getTextLength();
558         if ((start < 0) || (start > limit))
559             throw new IndexOutOfBoundsException("Illegal start index");
560         reset();
561         return search(start);
562     }
563 
564     /**
565      * Attempts to match the input sequence, starting at the beginning of the 
566      * region, against the pattern.
567      *
568      * <p> Like the {@link #matches matches} method, this method always starts
569      * at the beginning of the region; unlike that method, it does not
570      * require that the entire region be matched.
571      *
572      * <p> If the match succeeds then more information can be obtained via the
573      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
574      *
575      * @return  <tt>true</tt> if, and only if, a prefix of the input
576      *          sequence matches this matcher's pattern
577      */
578     public boolean lookingAt() {
579         return match(from, NOANCHOR);
580     }
581 
582     /**
583      * Returns a literal replacement <code>String</code> for the specified
584      * <code>String</code>.
585      *
586      * This method produces a <code>String</code> that will work
587      * as a literal replacement <code>s</code> in the
588      * <code>appendReplacement</code> method of the {@link Matcher} class.
589      * The <code>String</code> produced will match the sequence of characters
590      * in <code>s</code> treated as a literal sequence. Slashes ('\') and
591      * dollar signs ('$') will be given no special meaning.
592      *
593      * @param  s The string to be literalized
594      * @return  A literal string replacement
595      * @since 1.5
596      */
597     public static String quoteReplacement(String s) {
598         if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
599             return s;
600         StringBuffer sb = new StringBuffer();
601         for (int i=0; i<s.length(); i++) {
602             char c = s.charAt(i);
603             if (c == '\\') {
604                 sb.append('\\'); sb.append('\\');
605             } else if (c == '$') {
606                 sb.append('\\'); sb.append('$');
607             } else {
608                 sb.append(c);
609             }
610         }
611         return sb.toString();
612     }
613 
614     /**
615      * Implements a non-terminal append-and-replace step.
616      *
617      * <p> This method performs the following actions: </p>
618      *
619      * <ol>
620      *
621      *   <li><p> It reads characters from the input sequence, starting at the
622      *   append position, and appends them to the given string buffer.  It
623      *   stops after reading the last character preceding the previous match,
624      *   that is, the character at index {@link
625      *   #start()}&nbsp;<tt>-</tt>&nbsp;<tt>1</tt>.  </p></li>
626      *
627      *   <li><p> It appends the given replacement string to the string buffer.
628      *   </p></li>
629      *
630      *   <li><p> It sets the append position of this matcher to the index of
631      *   the last character matched, plus one, that is, to {@link #end()}.
632      *   </p></li>
633      *
634      * </ol>
635      *
636      * <p> The replacement string may contain references to subsequences
637      * captured during the previous match: Each occurrence of
638      * <tt>$</tt><i>g</i><tt></tt> will be replaced by the result of
639      * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>. 
640      * The first number after the <tt>$</tt> is always treated as part of
641      * the group reference. Subsequent numbers are incorporated into g if
642      * they would form a legal group reference. Only the numerals '0'
643      * through '9' are considered as potential components of the group
644      * reference. If the second group matched the string <tt>"foo"</tt>, for
645      * example, then passing the replacement string <tt>"$2bar"</tt> would
646      * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
647      * sign (<tt>$</tt>) may be included as a literal in the replacement
648      * string by preceding it with a backslash (<tt>\$</tt>).
649      *
650      * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
651      * the replacement string may cause the results to be different than if it
652      * were being treated as a literal replacement string. Dollar signs may be
653      * treated as references to captured subsequences as described above, and
654      * backslashes are used to escape literal characters in the replacement
655      * string.
656      *
657      * <p> This method is intended to be used in a loop together with the
658      * {@link #appendTail appendTail} and {@link #find find} methods.  The
659      * following code, for example, writes <tt>one dog two dogs in the
660      * yard</tt> to the standard-output stream: </p>
661      *
662      * <blockquote><pre>
663      * Pattern p = Pattern.compile("cat");
664      * Matcher m = p.matcher("one cat two cats in the yard");
665      * StringBuffer sb = new StringBuffer();
666      * while (m.find()) {
667      *     m.appendReplacement(sb, "dog");
668      * }
669      * m.appendTail(sb);
670      * System.out.println(sb.toString());</pre></blockquote>
671      *
672      * @param  sb
673      *         The target string buffer
674      *
675      * @param  replacement
676      *         The replacement string
677      *
678      * @return  This matcher
679      *
680      * @throws  IllegalStateException
681      *          If no match has yet been attempted,
682      *          or if the previous match operation failed
683      *
684      * @throws  IndexOutOfBoundsException
685      *          If the replacement string refers to a capturing group
686      *          that does not exist in the pattern
687      */
688     public Matcher appendReplacement(StringBuffer sb, String replacement) {
689 
690         // If no match, return error
691         if (first < 0)
692             throw new IllegalStateException("No match available");
693 
694         // Process substitution string to replace group references with groups
695         int cursor = 0;
696         String s = replacement;
697         StringBuffer result = new StringBuffer();
698 
699         while (cursor < replacement.length()) {
700             char nextChar = replacement.charAt(cursor);
701             if (nextChar == '\\') {
702                 cursor++;
703                 nextChar = replacement.charAt(cursor);
704                 result.append(nextChar);
705                 cursor++;
706             } else if (nextChar == '$') {
707                 // Skip past $
708                 cursor++;
709 
710                 // The first number is always a group
711                 int refNum = (int)replacement.charAt(cursor) - '0';
712                 if ((refNum < 0)||(refNum > 9))
713                     throw new IllegalArgumentException(
714                         "Illegal group reference");
715                 cursor++;
716 
717                 // Capture the largest legal group string
718                 boolean done = false;
719                 while (!done) {
720                     if (cursor >= replacement.length()) {
721                         break;
722                     }
723                     int nextDigit = replacement.charAt(cursor) - '0';
724                     if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
725                         break;
726                     }
727                     int newRefNum = (refNum * 10) + nextDigit;
728                     if (groupCount() < newRefNum) {
729                         done = true;
730                     } else {
731                         refNum = newRefNum;
732                         cursor++;
733                     }
734                 }
735 
736                 // Append group
737                 if (group(refNum) != null)
738                     result.append(group(refNum));
739             } else {
740                 result.append(nextChar);
741                 cursor++;
742             }
743         }
744 
745         // Append the intervening text
746         sb.append(getSubSequence(lastAppendPosition, first));
747         // Append the match substitution
748         sb.append(result.toString());
749 
750         lastAppendPosition = last;
751     return this;
752     }
753 
754     /**
755      * Implements a terminal append-and-replace step.
756      *
757      * <p> This method reads characters from the input sequence, starting at
758      * the append position, and appends them to the given string buffer.  It is
759      * intended to be invoked after one or more invocations of the {@link
760      * #appendReplacement appendReplacement} method in order to copy the
761      * remainder of the input sequence.  </p>
762      *
763      * @param  sb
764      *         The target string buffer
765      *
766      * @return  The target string buffer
767      */
768     public StringBuffer appendTail(StringBuffer sb) {
769         sb.append(getSubSequence(lastAppendPosition, getTextLength()).toString());
770     return sb;
771     }
772 
773     /**
774      * Replaces every subsequence of the input sequence that matches the
775      * pattern with the given replacement string.
776      *
777      * <p> This method first resets this matcher.  It then scans the input
778      * sequence looking for matches of the pattern.  Characters that are not
779      * part of any match are appended directly to the result string; each match
780      * is replaced in the result by the replacement string.  The replacement
781      * string may contain references to captured subsequences as in the {@link
782      * #appendReplacement appendReplacement} method.
783      *
784      * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
785      * the replacement string may cause the results to be different than if it
786      * were being treated as a literal replacement string. Dollar signs may be
787      * treated as references to captured subsequences as described above, and
788      * backslashes are used to escape literal characters in the replacement
789      * string.
790      *
791      * <p> Given the regular expression <tt>a*b</tt>, the input
792      * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
793      * <tt>"-"</tt>, an invocation of this method on a matcher for that
794      * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
795      *
796      * <p> Invoking this method changes this matcher's state.  If the matcher
797      * is to be used in further matching operations then it should first be
798      * reset.  </p>
799      *
800      * @param  replacement
801      *         The replacement string
802      *
803      * @return  The string constructed by replacing each matching subsequence
804      *          by the replacement string, substituting captured subsequences
805      *          as needed
806      */
807     public String replaceAll(String replacement) {
808         reset();
809         boolean result = find();
810         if (result) {
811             StringBuffer sb = new StringBuffer();
812             do {
813                 appendReplacement(sb, replacement);
814                 result = find();
815             } while (result);
816             appendTail(sb);
817             return sb.toString();
818         }
819         return text.toString();
820     }
821 
822     /**
823      * Replaces the first subsequence of the input sequence that matches the
824      * pattern with the given replacement string.
825      *
826      * <p> This method first resets this matcher.  It then scans the input
827      * sequence looking for a match of the pattern.  Characters that are not
828      * part of the match are appended directly to the result string; the match
829      * is replaced in the result by the replacement string.  The replacement
830      * string may contain references to captured subsequences as in the {@link
831      * #appendReplacement appendReplacement} method.
832      *
833      * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
834      * the replacement string may cause the results to be different than if it
835      * were being treated as a literal replacement string. Dollar signs may be
836      * treated as references to captured subsequences as described above, and
837      * backslashes are used to escape literal characters in the replacement
838      * string.
839      *
840      * <p> Given the regular expression <tt>dog</tt>, the input
841      * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
842      * <tt>"cat"</tt>, an invocation of this method on a matcher for that
843      * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>.  </p>
844      *
845      * <p> Invoking this method changes this matcher's state.  If the matcher
846      * is to be used in further matching operations then it should first be
847      * reset.  </p>
848      *
849      * @param  replacement
850      *         The replacement string
851      * @return  The string constructed by replacing the first matching
852      *          subsequence by the replacement string, substituting captured
853      *          subsequences as needed
854      */
855     public String replaceFirst(String replacement) {
856         if (replacement == null)
857             throw new NullPointerException("replacement");
858         StringBuffer sb = new StringBuffer();
859         reset();
860         if (find())
861             appendReplacement(sb, replacement);
862         appendTail(sb);
863         return sb.toString();
864     }
865 
866     /**
867      * Sets the limits of this matcher's region. The region is the part of the
868      * input sequence that will be searched to find a match. Invoking this
869      * method resets the matcher, and then sets the region to start at the
870      * index specified by the <code>start</code> parameter and end at the
871      * index specified by the <code>end</code> parameter.
872      *
873      * <p>Depending on the transparency and anchoring being used (see
874      * {@link #useTransparentBounds useTransparentBounds} and 
875      * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
876      * as anchors may behave differently at or around the boundaries of the
877      * region.
878      *
879      * @param  start
880      *         The index to start searching at (inclusive)
881      * @param  end
882      *         The index to end searching at (exclusive)
883      * @throws  IndexOutOfBoundsException
884      *          If start or end is less than zero, if
885      *          start is greater than the length of the input sequence, if
886      *          end is greater than the length of the input sequence, or if
887      *          start is greater than end.
888      * @return  this matcher
889      * @since 1.5
890      */
891     public Matcher region(int start, int end) {
892         if ((start < 0) || (start > getTextLength()))
893             throw new IndexOutOfBoundsException("start");
894         if ((end < 0) || (end > getTextLength()))
895             throw new IndexOutOfBoundsException("end");
896         if (start > end)
897             throw new IndexOutOfBoundsException("start > end");
898         reset();
899         from = start;
900         to = end;
901         return this;
902     }
903 
904     /**
905      * Reports the start index of this matcher's region. The
906      * searches this matcher conducts are limited to finding matches
907      * within {@link #regionStart regionStart} (inclusive) and
908      * {@link #regionEnd regionEnd} (exclusive).
909      *
910      * @return  The starting point of this matcher's region
911      * @since 1.5
912      */
913     public int regionStart() {
914         return from;
915     }
916 
917     /**
918      * Reports the end index (exclusive) of this matcher's region.
919      * The searches this matcher conducts are limited to finding matches
920      * within {@link #regionStart regionStart} (inclusive) and
921      * {@link #regionEnd regionEnd} (exclusive).
922      *
923      * @return  the ending point of this matcher's region
924      * @since 1.5
925      */
926     public int regionEnd() {
927         return to;
928     }
929 
930     /**
931      * Queries the transparency of region bounds for this matcher.
932      *
933      * <p> This method returns <tt>true</tt> if this matcher uses
934      * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
935      * bounds.
936      *
937      * <p> See {@link #useTransparentBounds useTransparentBounds} for a 
938      * description of transparent and opaque bounds.
939      *
940      * <p> By default, a matcher uses opaque region boundaries.
941      *
942      * @return <tt>true</tt> iff this matcher is using transparent bounds,
943      *         <tt>false</tt> otherwise.
944      * @see java.util.regex.Matcher#useTransparentBounds(boolean)
945      * @since 1.5
946      */
947     public boolean hasTransparentBounds() {
948         return transparentBounds;
949     }
950 
951     /**
952      * Sets the transparency of region bounds for this matcher.
953      *
954      * <p> Invoking this method with an argument of <tt>true</tt> will set this
955      * matcher to use <i>transparent</i> bounds. If the boolean 
956      * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
957      * 
958      * <p> Using transparent bounds, the boundaries of this 
959      * matcher's region are transparent to lookahead, lookbehind,
960      * and boundary matching constructs. Those constructs can see beyond the 
961      * boundaries of the region to see if a match is appropriate.
962      *
963      * <p> Using opaque bounds, the boundaries of this matcher's 
964      * region are opaque to lookahead, lookbehind, and boundary matching 
965      * constructs that may try to see beyond them. Those constructs cannot
966      * look past the boundaries so they will fail to match anything outside
967      * of the region.
968      *
969      * <p> By default, a matcher uses opaque bounds.
970      *
971      * @param  b a boolean indicating whether to use opaque or transparent
972      *         regions
973      * @return this matcher
974      * @see java.util.regex.Matcher#hasTransparentBounds
975      * @since 1.5
976      */
977     public Matcher useTransparentBounds(boolean b) {
978         transparentBounds = b;
979         return this;
980     }
981  
982     /**
983      * Queries the anchoring of region bounds for this matcher.
984      *
985      * <p> This method returns <tt>true</tt> if this matcher uses
986      * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
987      *
988      * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a 
989      * description of anchoring bounds.
990      *
991      * <p> By default, a matcher uses anchoring region boundaries.
992      *
993      * @return <tt>true</tt> iff this matcher is using anchoring bounds,
994      *         <tt>false</tt> otherwise.
995      * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
996      * @since 1.5
997      */
998     public boolean hasAnchoringBounds() {
999         return anchoringBounds;
1000    }
1001
1002    /**
1003     * Sets the anchoring of region bounds for this matcher.
1004     *
1005     * <p> Invoking this method with an argument of <tt>true</tt> will set this
1006     * matcher to use <i>anchoring</i> bounds. If the boolean 
1007     * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be 
1008     * used.
1009     * 
1010     * <p> Using anchoring bounds, the boundaries of this 
1011     * matcher's region match anchors such as ^ and $.
1012     *
1013     * <p> Without anchoring bounds, the boundaries of this 
1014     * matcher's region will not match anchors such as ^ and $.
1015     *
1016     * <p> By default, a matcher uses anchoring region boundaries.
1017     *
1018     * @param  b a boolean indicating whether or not to use anchoring bounds.
1019     * @return this matcher
1020     * @see java.util.regex.Matcher#hasAnchoringBounds
1021     * @since 1.5
1022     */
1023    public Matcher useAnchoringBounds(boolean b) {
1024        anchoringBounds = b;
1025        return this;
1026    }
1027
1028    /**
1029     * <p>Returns the string representation of this matcher. The
1030     * string representation of a <code>Matcher</code> contains information
1031     * that may be useful for debugging. The exact format is unspecified.
1032     *
1033     * @return  The string representation of this matcher
1034     * @since 1.5
1035     */
1036    public String toString() {
1037        StringBuffer sb = new StringBuffer();
1038    sb.append("java.util.regex.Matcher");
1039    sb.append("[pattern=" + pattern());
1040    sb.append(" region=");
1041    sb.append(regionStart() + "," + regionEnd());
1042        sb.append(" lastmatch=");
1043        if ((first >= 0) && (group() != null)) {
1044            sb.append(group());
1045        }
1046    sb.append("]");
1047    return sb.toString();
1048    }
1049
1050    /**
1051     * <p>Returns true if the end of input was hit by the search engine in
1052     * the last match operation performed by this matcher.
1053     *
1054     * <p>When this method returns true, then it is possible that more input 
1055     * would have changed the result of the last search.
1056     *
1057     * @return  true iff the end of input was hit in the last match; false
1058     *          otherwise
1059     * @since 1.5
1060     */
1061    public boolean hitEnd() {
1062        return hitEnd;
1063    }
1064
1065    /**
1066     * <p>Returns true if more input could change a positive match into a 
1067     * negative one.
1068     *
1069     * <p>If this method returns true, and a match was found, then more
1070     * input could cause the match to be lost. If this method returns false 
1071     * and a match was found, then more input might change the match but the 
1072     * match won't be lost. If a match was not found, then requireEnd has no 
1073     * meaning.
1074     *
1075     * @return  true iff more input could change a positive match into a 
1076     *          negative one.
1077     * @since 1.5
1078     */
1079    public boolean requireEnd() {
1080        return requireEnd;
1081    }
1082
1083    /**
1084     * Initiates a search to find a Pattern within the given bounds.
1085     * The groups are filled with default values and the match of the root
1086     * of the state machine is called. The state machine will hold the state
1087     * of the match as it proceeds in this matcher.
1088     * 
1089     * Matcher.from is not set here, because it is the "hard" boundary
1090     * of the start of the search which anchors will set to. The from param
1091     * is the "soft" boundary of the start of the search, meaning that the
1092     * regex tries to match at that index but ^ won't match there. Subsequent
1093     * calls to the search methods start at a new "soft" boundary which is
1094     * the end of the previous match.
1095     */
1096    boolean search(int from) {
1097        this.hitEnd = false;
1098        this.requireEnd = false;
1099        from        = from < 0 ? 0 : from;
1100        this.first  = from;
1101        this.oldLast = oldLast < 0 ? from : oldLast;
1102        for (int i = 0; i < groups.length; i++)
1103            groups[i] = -1;
1104        acceptMode = NOANCHOR;
1105        boolean result = parentPattern.root.match(this, from, text);
1106        if (!result)
1107            this.first = -1;
1108        this.oldLast = this.last;
1109        return result;
1110    }
1111
1112    /**
1113     * Initiates a search for an anchored match to a Pattern within the given
1114     * bounds. The groups are filled with default values and the match of the
1115     * root of the state machine is called. The state machine will hold the
1116     * state of the match as it proceeds in this matcher.
1117     */
1118    boolean match(int from, int anchor) {
1119        this.hitEnd = false;
1120        this.requireEnd = false;
1121        from        = from < 0 ? 0 : from;
1122        this.first  = from;
1123        this.oldLast = oldLast < 0 ? from : oldLast;
1124        for (int i = 0; i < groups.length; i++)
1125            groups[i] = -1;
1126        acceptMode = anchor;
1127        boolean result = parentPattern.matchRoot.match(this, from, text);
1128        if (!result)
1129            this.first = -1;
1130        this.oldLast = this.last;
1131        return result;
1132    }
1133
1134    /**
1135     * Returns the end index of the text.
1136     *
1137     * @return the index after the last character in the text
1138     */
1139    int getTextLength() {
1140        return text.length();
1141    }
1142
1143    /**
1144     * Generates a String from this Matcher's input in the specified range.
1145     *
1146     * @param  beginIndex   the beginning index, inclusive
1147     * @param  endIndex     the ending index, exclusive
1148     * @return A String generated from this Matcher's input
1149     */
1150    CharSequence getSubSequence(int beginIndex, int endIndex) {
1151        return text.subSequence(beginIndex, endIndex);
1152    }
1153
1154    /**
1155     * Returns this Matcher's input character at index i.
1156     *
1157     * @return A char from the specified index
1158     */
1159    char charAt(int i) {
1160        return text.charAt(i);
1161    }
1162
1163}
1164