1   /*
2    * %W% %E%
3    *
4    * Copyright (c) 2006, Oracle and/or its affiliates. All rights reserved.
5    * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   
8   package java.util.regex;
9   
10  import java.security.AccessController;
11  import java.security.PrivilegedAction;
12  import java.text.CharacterIterator;
13  import java.text.Normalizer;
14  import java.util.ArrayList;
15  import java.util.HashMap;
16  import java.util.Arrays;
17  
18  
19  /**
20   * A compiled representation of a regular expression.
21   *
22   * <p> A regular expression, specified as a string, must first be compiled into
23   * an instance of this class.  The resulting pattern can then be used to create
24   * a {@link Matcher} object that can match arbitrary {@link
25   * java.lang.CharSequence </code>character sequences<code>} against the regular
26   * expression.  All of the state involved in performing a match resides in the
27   * matcher, so many matchers can share the same pattern.
28   *
29   * <p> A typical invocation sequence is thus
30   *
31   * <blockquote><pre>
32   * Pattern p = Pattern.{@link #compile compile}("a*b");
33   * Matcher m = p.{@link #matcher matcher}("aaaaab");
34   * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
35   *
36   * <p> A {@link #matches matches} method is defined by this class as a
37   * convenience for when a regular expression is used just once.  This method
38   * compiles an expression and matches an input sequence against it in a single
39   * invocation.  The statement
40   *
41   * <blockquote><pre>
42   * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
43   *
44   * is equivalent to the three statements above, though for repeated matches it
45   * is less efficient since it does not allow the compiled pattern to be reused.
46   *
47   * <p> Instances of this class are immutable and are safe for use by multiple
48   * concurrent threads.  Instances of the {@link Matcher} class are not safe for
49   * such use.
50   *
51   *
52   * <a name="sum"></a>
53   * <h4> Summary of regular-expression constructs </h4>
54   *
55   * <table border="0" cellpadding="1" cellspacing="0"
56   *  summary="Regular expression constructs, and what they match">
57   *
58   * <tr align="left">
59   * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th>
60   * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
61   * </tr>
62   *
63   * <tr><th>&nbsp;</th></tr>
64   * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>
65   *
66   * <tr><td valign="top" headers="construct characters"><i>x</i></td>
67   *     <td headers="matches">The character <i>x</i></td></tr>
68   * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td>
69   *     <td headers="matches">The backslash character</td></tr>
70   * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td>
71   *     <td headers="matches">The character with octal value <tt>0</tt><i>n</i>
72   *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>
73   * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td>
74   *     <td headers="matches">The character with octal value <tt>0</tt><i>nn</i>
75   *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>
76   * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td>
77   *     <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i>
78   *         (0&nbsp;<tt>&lt;=</tt>&nbsp;<i>m</i>&nbsp;<tt>&lt;=</tt>&nbsp;3,
79   *         0&nbsp;<tt>&lt;=</tt>&nbsp;<i>n</i>&nbsp;<tt>&lt;=</tt>&nbsp;7)</td></tr>
80   * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td>
81   *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr>
82   * <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>
83   *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr>
84   * <tr><td valign="top" headers="construct characters"><tt>\t</tt></td>
85   *     <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr>
86   * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
87   *     <td headers="matches">The newline (line feed) character (<tt>'&#92;u000A'</tt>)</td></tr>
88   * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td>
89   *     <td headers="matches">The carriage-return character (<tt>'&#92;u000D'</tt>)</td></tr>
90   * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td>
91   *     <td headers="matches">The form-feed character (<tt>'&#92;u000C'</tt>)</td></tr>
92   * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td>
93   *     <td headers="matches">The alert (bell) character (<tt>'&#92;u0007'</tt>)</td></tr>
94   * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td>
95   *     <td headers="matches">The escape character (<tt>'&#92;u001B'</tt>)</td></tr>
96   * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td>
97   *     <td headers="matches">The control character corresponding to <i>x</i></td></tr>
98   *
99   * <tr><th>&nbsp;</th></tr>
100  * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
101  *
102  * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td>
103  *     <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr>
104  * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td>
105  *     <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr>
106  * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td>
107  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>
108  *         or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr>
109  * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td>
110  *     <td headers="matches"><tt>a</tt> through <tt>d</tt>,
111  *      or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr>
112  * <tr><td valign="top" headers="construct classes"><tt>[a-z&amp;&amp;[def]]</tt></td>
113  *     <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</td></tr>
114  * <tr><td valign="top" headers="construct classes"><tt>[a-z&amp;&amp;[^bc]]</tt></td>
115  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>,
116  *         except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr>
117  * <tr><td valign="top" headers="construct classes"><tt>[a-z&amp;&amp;[^m-p]]</tt></td>
118  *     <td headers="matches"><tt>a</tt> through <tt>z</tt>,
119  *          and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt> (subtraction)</td></tr>
120  * <tr><th>&nbsp;</th></tr>
121  *
122  * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>
123  *
124  * <tr><td valign="top" headers="construct predef"><tt>.</tt></td>
125  *     <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
126  * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td>
127  *     <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
128  * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
129  *     <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
130  * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
131  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
132  * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
133  *     <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
134  * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
135  *     <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
136  * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
137  *     <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
138  *
139  * <tr><th>&nbsp;</th></tr>
140  * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
141  *
142  * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td>
143  *     <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr>
144  * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td>
145  *     <td headers="matches">An upper-case alphabetic character: <tt>[A-Z]</tt></td></tr>
146  * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td>
147  *     <td headers="matches">All ASCII: <tt>[\x00-\x7F]</tt></td></tr>
148  * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td>
149  *     <td headers="matches">An alphabetic character: <tt>[\p{Lower}\p{Upper}]</tt></td></tr>
150  * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td>
151  *     <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr>
152  * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td>
153  *     <td headers="matches">An alphanumeric character: <tt>[\p{Alpha}\p{Digit}]</tt></td></tr>
154  * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td>
155  *     <td headers="matches">Punctuation: One of <tt>!"#$%&amp;'()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</tt></td></tr>
156  *     <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt>
157  *          <tt>[\X21-\X2F\X3A-\X40\X5B-\X60\X7B-\X7E]</tt> -->
158  * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td>
159  *     <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr>
160  * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td>
161  *     <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr>
162  * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td>
163  *     <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr>
164  * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
165  *     <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>
166  * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
167  *     <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
168  * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
169  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
170  *
171  * <tr><th>&nbsp;</th></tr>
172  * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
173  *
174  * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
175  *     <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
176  * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
177  *     <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
178  * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
179  *     <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
180  * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
181  *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
182  *
183  * <tr><th>&nbsp;</th></tr>
184  * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode blocks and categories</th></tr>
185  *
186  * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
187  *     <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr>
188  * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
189  *     <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
190  * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
191  *     <td headers="matches">A currency symbol</td></tr>
192  * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
193  *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
194  * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&amp;&amp;[^\p{Lu}]]&nbsp;</tt></td>
195  *     <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
196  *
197  * <tr><th>&nbsp;</th></tr>
198  * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
199  *
200  * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
201  *     <td headers="matches">The beginning of a line</td></tr>
202  * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
203  *     <td headers="matches">The end of a line</td></tr>
204  * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
205  *     <td headers="matches">A word boundary</td></tr>
206  * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
207  *     <td headers="matches">A non-word boundary</td></tr>
208  * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
209  *     <td headers="matches">The beginning of the input</td></tr>
210  * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td>
211  *     <td headers="matches">The end of the previous match</td></tr>
212  * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td>
213  *     <td headers="matches">The end of the input but for the final
214  *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>
215  * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td>
216  *     <td headers="matches">The end of the input</td></tr>
217  *
218  * <tr><th>&nbsp;</th></tr>
219  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
220  *
221  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
222  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
223  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td>
224  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
225  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td>
226  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
227  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td>
228  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
229  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td>
230  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
231  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td>
232  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
233  *
234  * <tr><th>&nbsp;</th></tr>
235  * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
236  *
237  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td>
238  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
239  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td>
240  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
241  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td>
242  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
243  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td>
244  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
245  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td>
246  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
247  * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td>
248  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
249  *
250  * <tr><th>&nbsp;</th></tr>
251  * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>
252  *
253  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td>
254  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
255  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td>
256  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
257  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td>
258  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
259  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td>
260  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
261  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td>
262  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
263  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
264  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
265  *
266  * <tr><th>&nbsp;</th></tr>
267  * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
268  *
269  * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
270  *     <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
271  * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
272  *     <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
273  * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
274  *     <td headers="matches"><i>X</i>, as a <a href="#cg">capturing group</a></td></tr>
275  *
276  * <tr><th>&nbsp;</th></tr>
277  * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
278  *
279  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
280  *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
281  *     <a href="#cg">capturing group</a> matched</td></tr>
282  *
283  * <tr><th>&nbsp;</th></tr>
284  * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
285  *
286  * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
287  *     <td headers="matches">Nothing, but quotes the following character</td></tr>
288  * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
289  *     <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
290  * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
291  *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
292  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
293  *
294  * <tr><th>&nbsp;</th></tr>
295  * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
296  *
297  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
298  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
299  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
300  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
301  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
302  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
303  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
304  *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
305  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
306  * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a>
307  * <a href="#COMMENTS">x</a> on - off</td></tr>
308  * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
309  *     <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
310  * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
311  *     <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
312  * <tr><td valign="top" headers="construct special"><tt>(?&lt;=</tt><i>X</i><tt>)</tt></td>
313  *     <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
314  * <tr><td valign="top" headers="construct special"><tt>(?&lt;!</tt><i>X</i><tt>)</tt></td>
315  *     <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
316  * <tr><td valign="top" headers="construct special"><tt>(?&gt;</tt><i>X</i><tt>)</tt></td>
317  *     <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
318  *
319  * </table>
320  *
321  * <hr>
322  *
323  *
324  * <a name="bs"></a>
325  * <h4> Backslashes, escapes, and quoting </h4>
326  *
327  * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped
328  * constructs, as defined in the table above, as well as to quote characters
329  * that otherwise would be interpreted as unescaped constructs.  Thus the
330  * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
331  * left brace.
332  *
333  * <p> It is an error to use a backslash prior to any alphabetic character that
334  * does not denote an escaped construct; these are reserved for future
335  * extensions to the regular-expression language.  A backslash may be used
336  * prior to a non-alphabetic character regardless of whether that character is
337  * part of an unescaped construct.
338  *
339  * <p> Backslashes within string literals in Java source code are interpreted
340  * as required by the <a
341  * href="http://java.sun.com/docs/books/jls">Java Language
342  * Specification</a> as either <a
343  * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">Unicode
344  * escapes</a> or other <a
345  * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#101089">character
346  * escapes</a>.  It is therefore necessary to double backslashes in string
347  * literals that represent regular expressions to protect them from
348  * interpretation by the Java bytecode compiler.  The string literal
349  * <tt>"&#92;b"</tt>, for example, matches a single backspace character when
350  * interpreted as a regular expression, while <tt>"&#92;&#92;b"</tt> matches a
351  * word boundary.  The string literal <tt>"&#92;(hello&#92;)"</tt> is illegal
352  * and leads to a compile-time error; in order to match the string
353  * <tt>(hello)</tt> the string literal <tt>"&#92;&#92;(hello&#92;&#92;)"</tt>
354  * must be used.
355  *
356  * <a name="cc"></a>
357  * <h4> Character Classes </h4>
358  *
359  *    <p> Character classes may appear within other character classes, and
360  *    may be composed by the union operator (implicit) and the intersection
361  *    operator (<tt>&amp;&amp;</tt>).
362  *    The union operator denotes a class that contains every character that is
363  *    in at least one of its operand classes.  The intersection operator
364  *    denotes a class that contains every character that is in both of its
365  *    operand classes.
366  *
367  *    <p> The precedence of character-class operators is as follows, from
368  *    highest to lowest:
369  *
370  *    <blockquote><table border="0" cellpadding="1" cellspacing="0"
371  *                 summary="Precedence of character class operators.">
372  *      <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
373  *    <td>Literal escape&nbsp;&nbsp;&nbsp;&nbsp;</td>
374  *    <td><tt>\x</tt></td></tr>
375  *     <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
376  *    <td>Grouping</td>
377  *    <td><tt>[...]</tt></td></tr>
378  *     <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
379  *    <td>Range</td>
380  *    <td><tt>a-z</tt></td></tr>
381  *      <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
382  *    <td>Union</td>
383  *    <td><tt>[a-e][i-u]</tt></td></tr>
384  *      <tr><th>5&nbsp;&nbsp;&nbsp;&nbsp;</th>
385  *    <td>Intersection</td>
386  *    <td><tt>[a-z&amp;&amp;[aeiou]]</tt></td></tr>
387  *    </table></blockquote>
388  *
389  *    <p> Note that a different set of metacharacters is in effect inside
390  *    a character class than outside a character class. For instance, the
391  *    regular expression <tt>.</tt> loses its special meaning inside a
392  *    character class, while the expression <tt>-</tt> becomes a
393  *    range-forming metacharacter.
394  *
395  * <a name="lt"></a>
396  * <h4> Line terminators </h4>
397  *
398  * <p> A <i>line terminator</i> is a one- or two-character sequence that marks
399  * the end of a line of the input character sequence.  The following are
400  * recognized as line terminators:
401  *
402  * <ul>
403  *
404  *   <li> A newline (line feed) character&nbsp;(<tt>'\n'</tt>),
405  *
406  *   <li> A carriage-return character followed immediately by a newline
407  *   character&nbsp;(<tt>"\r\n"</tt>),
408  *
409  *   <li> A standalone carriage-return character&nbsp;(<tt>'\r'</tt>),
410  *
411  *   <li> A next-line character&nbsp;(<tt>'&#92;u0085'</tt>),
412  *
413  *   <li> A line-separator character&nbsp;(<tt>'&#92;u2028'</tt>), or
414  *
415  *   <li> A paragraph-separator character&nbsp;(<tt>'&#92;u2029'</tt>).
416  *
417  * </ul>
418  * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
419  * recognized are newline characters.
420  *
421  * <p> The regular expression <tt>.</tt> matches any character except a line
422  * terminator unless the {@link #DOTALL} flag is specified.
423  *
424  * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
425  * line terminators and only match at the beginning and the end, respectively,
426  * of the entire input sequence. If {@link #MULTILINE} mode is activated then
427  * <tt>^</tt> matches at the beginning of input and after any line terminator
428  * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
429  * matches just before a line terminator or the end of the input sequence.
430  *
431  * <a name="cg"></a>
432  * <h4> Groups and capturing </h4>
433  *
434  * <p> Capturing groups are numbered by counting their opening parentheses from
435  * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there
436  * are four such groups: </p>
437  *
438  * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
439  * <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
440  *     <td><tt>((A)(B(C)))</tt></td></tr>
441  * <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
442  *     <td><tt>(A)</tt></td></tr>
443  * <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
444  *     <td><tt>(B(C))</tt></td></tr>
445  * <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
446  *     <td><tt>(C)</tt></td></tr>
447  * </table></blockquote>
448  *
449  * <p> Group zero always stands for the entire expression.
450  *
451  * <p> Capturing groups are so named because, during a match, each subsequence
452  * of the input sequence that matches such a group is saved.  The captured
453  * subsequence may be used later in the expression, via a back reference, and
454  * may also be retrieved from the matcher once the match operation is complete.
455  *
456  * <p> The captured input associated with a group is always the subsequence
457  * that the group most recently matched.  If a group is evaluated a second time
458  * because of quantification then its previously-captured value, if any, will
459  * be retained if the second evaluation fails.  Matching the string
460  * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
461  * group two set to <tt>"b"</tt>.  All captured input is discarded at the
462  * beginning of each match.
463  *
464  * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
465  * that do not capture text and do not count towards the group total.
466  *
467  *
468  * <h4> Unicode support </h4>
469  *
470  * <p> This class is in conformance with Level 1 of <a
471  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
472  * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
473  * Canonical Equivalents.
474  *
475  * <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code
476  * are processed as described in <a
477  * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">&sect;3.3</a>
478  * of the Java Language Specification.  Such escape sequences are also
479  * implemented directly by the regular-expression parser so that Unicode
480  * escapes can be used in expressions that are read from files or from the
481  * keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>,
482  * while not equal, compile into the same pattern, which matches the character
483  * with hexadecimal value <tt>0x2014</tt>.
484  *
485  * <a name="ubc"></a> <p>Unicode blocks and categories are written with the
486  * <tt>\p</tt> and <tt>\P</tt> constructs as in
487  * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
488  * property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> does not match if
489  * the input has that property.  Blocks are specified with the prefix
490  * <tt>In</tt>, as in <tt>InMongolian</tt>.  Categories may be specified with
491  * the optional prefix <tt>Is</tt>: Both <tt>\p{L}</tt> and <tt>\p{IsL}</tt>
492  * denote the category of Unicode letters.  Blocks and categories can be used
493  * both inside and outside of a character class.
494  *
495  * <p> The supported categories are those of
496  * <a href="http://www.unicode.org/unicode/standard/standard.html">
497  * <i>The Unicode Standard</i></a> in the version specified by the
498  * {@link java.lang.Character Character} class. The category names are those
499  * defined in the Standard, both normative and informative.
500  * The block names supported by <code>Pattern</code> are the valid block names
501  * accepted and defined by
502  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
503  *
504  * <a name="jcc"></a> <p>Categories that behave like the java.lang.Character
505  * boolean is<i>methodname</i> methods (except for the deprecated ones) are
506  * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
507  * the specified property has the name <tt>java<i>methodname</i></tt>.
508  *
509  * <h4> Comparison to Perl 5 </h4>
510  *
511  * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
512  * with ordered alternation as occurs in Perl 5.
513  *
514  * <p> Perl constructs not supported by this class: </p>
515  *
516  * <ul>
517  *
518  *    <li><p> The conditional constructs <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>)</tt> and
519  *    <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
520  *    </p></li>
521  *
522  *    <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
523  *    and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
524  *
525  *    <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
526  *
527  *    <li><p> The preprocessing operations <tt>\l</tt>, <tt>&#92;u</tt>,
528  *    <tt>\L</tt>, and <tt>\U</tt>.  </p></li>
529  *
530  * </ul>
531  *
532  * <p> Constructs supported by this class but not by Perl: </p>
533  *
534  * <ul>
535  *
536  *    <li><p> Possessive quantifiers, which greedily match as much as they can
537  *    and do not back off, even when doing so would allow the overall match to
538  *    succeed.  </p></li>
539  *
540  *    <li><p> Character-class union and intersection as described
541  *    <a href="#cc">above</a>.</p></li>
542  *
543  * </ul>
544  *
545  * <p> Notable differences from Perl: </p>
546  *
547  * <ul>
548  *
549  *    <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
550  *    as back references; a backslash-escaped number greater than <tt>9</tt> is
551  *    treated as a back reference if at least that many subexpressions exist,
552  *    otherwise it is interpreted, if possible, as an octal escape.  In this
553  *    class octal escapes must always begin with a zero. In this class,
554  *    <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
555  *    references, and a larger number is accepted as a back reference if at
556  *    least that many subexpressions exist at that point in the regular
557  *    expression, otherwise the parser will drop digits until the number is
558  *    smaller or equal to the existing number of groups or it is one digit.
559  *    </p></li>
560  *
561  *    <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
562  *    where the last match left off.  This functionality is provided implicitly
563  *    by the {@link Matcher} class: Repeated invocations of the {@link
564  *    Matcher#find find} method will resume where the last match left off,
565  *    unless the matcher is reset.  </p></li>
566  *
567  *    <li><p> In Perl, embedded flags at the top level of an expression affect
568  *    the whole expression.  In this class, embedded flags always take effect
569  *    at the point at which they appear, whether they are at the top level or
570  *    within a group; in the latter case, flags are restored at the end of the
571  *    group just as in Perl.  </p></li>
572  *
573  *    <li><p> Perl is forgiving about malformed matching constructs, as in the
574  *    expression <tt>*a</tt>, as well as dangling brackets, as in the
575  *    expression <tt>abc]</tt>, and treats them as literals.  This
576  *    class also accepts dangling brackets but is strict about dangling
577  *    metacharacters like +, ? and *, and will throw a
578  *    {@link PatternSyntaxException} if it encounters them. </p></li>
579  *
580  * </ul>
581  *
582  *
583  * <p> For a more precise description of the behavior of regular expression
584  * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">
 * <i>Mastering Regular Expressions, 3rd Edition</i>, Jeffrey E. F. Friedl,
586  * O'Reilly and Associates, 2006.</a>
587  * </p>
588  *
589  * @see java.lang.String#split(String, int)
590  * @see java.lang.String#split(String)
591  *
592  * @author      Mike McCloskey
593  * @author      Mark Reinhold
594  * @author  JSR-51 Expert Group
595  * @version     %I%, %E%
596  * @since       1.4
597  * @spec    JSR-51
598  */
599 
600 public final class Pattern
601     implements java.io.Serializable
602 {
603 
604     /**
605      * Regular expression modifier values.  Instead of being passed as
606      * arguments, they can also be passed as inline modifiers.
607      * For example, the following statements have the same effect.
608      * <pre>
609      * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
610      * RegExp r2 = RegExp.compile("(?im)abc", 0);
611      * </pre>
612      *
613      * The flags are duplicated so that the familiar Perl match flag
614      * names are available.
615      */
616 
617     /**
618      * Enables Unix lines mode.
619      *
620      * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized
621      * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.
622      *
623      * <p> Unix lines mode can also be enabled via the embedded flag
624      * expression&nbsp;<tt>(?d)</tt>.
625      */
626     public static final int UNIX_LINES = 0x01;
627 
628     /**
629      * Enables case-insensitive matching.
630      *
631      * <p> By default, case-insensitive matching assumes that only characters
632      * in the US-ASCII charset are being matched.  Unicode-aware
633      * case-insensitive matching can be enabled by specifying the {@link
634      * #UNICODE_CASE} flag in conjunction with this flag.
635      *
636      * <p> Case-insensitive matching can also be enabled via the embedded flag
637      * expression&nbsp;<tt>(?i)</tt>.
638      *
639      * <p> Specifying this flag may impose a slight performance penalty.  </p>
640      */
641     public static final int CASE_INSENSITIVE = 0x02;
642 
643     /**
644      * Permits whitespace and comments in pattern.
645      *
646      * <p> In this mode, whitespace is ignored, and embedded comments starting
647      * with <tt>#</tt> are ignored until the end of a line.
648      *
649      * <p> Comments mode can also be enabled via the embedded flag
650      * expression&nbsp;<tt>(?x)</tt>.
651      */
652     public static final int COMMENTS = 0x04;
653 
654     /**
655      * Enables multiline mode.
656      *
657      * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match
658      * just after or just before, respectively, a line terminator or the end of
659      * the input sequence.  By default these expressions only match at the
660      * beginning and the end of the entire input sequence.
661      *
662      * <p> Multiline mode can also be enabled via the embedded flag
663      * expression&nbsp;<tt>(?m)</tt>.  </p>
664      */
665     public static final int MULTILINE = 0x08;
666 
667     /**
668      * Enables literal parsing of the pattern.
669      *
670      * <p> When this flag is specified then the input string that specifies
671      * the pattern is treated as a sequence of literal characters.
672      * Metacharacters or escape sequences in the input sequence will be
673      * given no special meaning.
674      *
675      * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
676      * matching when used in conjunction with this flag. The other flags
677      * become superfluous.
678      *
679      * <p> There is no embedded flag character for enabling literal parsing.
680      * @since 1.5
681      */
682     public static final int LITERAL = 0x10;
683 
684     /**
685      * Enables dotall mode.
686      *
687      * <p> In dotall mode, the expression <tt>.</tt> matches any character,
688      * including a line terminator.  By default this expression does not match
689      * line terminators.
690      *
691      * <p> Dotall mode can also be enabled via the embedded flag
692      * expression&nbsp;<tt>(?s)</tt>.  (The <tt>s</tt> is a mnemonic for
693      * "single-line" mode, which is what this is called in Perl.)  </p>
694      */
695     public static final int DOTALL = 0x20;
696 
697     /**
698      * Enables Unicode-aware case folding.
699      *
700      * <p> When this flag is specified then case-insensitive matching, when
701      * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
702      * consistent with the Unicode Standard.  By default, case-insensitive
703      * matching assumes that only characters in the US-ASCII charset are being
704      * matched.
705      *
706      * <p> Unicode-aware case folding can also be enabled via the embedded flag
707      * expression&nbsp;<tt>(?u)</tt>.
708      *
709      * <p> Specifying this flag may impose a performance penalty.  </p>
710      */
711     public static final int UNICODE_CASE = 0x40;
712 
713     /**
714      * Enables canonical equivalence.
715      *
716      * <p> When this flag is specified then two characters will be considered
717      * to match if, and only if, their full canonical decompositions match.
718      * The expression <tt>"a&#92;u030A"</tt>, for example, will match the
719      * string <tt>"&#92;u00E5"</tt> when this flag is specified.  By default,
720      * matching does not take canonical equivalence into account.
721      *
722      * <p> There is no embedded flag character for enabling canonical
723      * equivalence.
724      *
725      * <p> Specifying this flag may impose a performance penalty.  </p>
726      */
727     public static final int CANON_EQ = 0x80;
728 
    /* Pattern has only two serialized components: The pattern string
     * and the flags, which are all that is needed to recompile the pattern
     * when it is deserialized.  Every other instance field below is
     * declared transient.
     */

    /** use serialVersionUID from Merlin b59 for interoperability */
    private static final long serialVersionUID = 5073258162644648461L;

    /**
     * The original regular-expression pattern string.
     *
     * @serial
     */
    private String pattern;

    /**
     * The original pattern flags.
     *
     * @serial
     */
    private int flags;

    /**
     * Boolean indicating this Pattern is compiled; this is necessary in order
     * to lazily compile deserialized Patterns.  The flag is volatile because
     * {@link #matcher} first reads it without holding a lock and only then
     * re-checks it inside <tt>synchronized(this)</tt>.
     */
    private transient volatile boolean compiled = false;

    /**
     * The normalized pattern string.  {@link #normalize()} stores the NFD
     * form of the pattern here and then replaces it with the rewritten
     * pattern that matches canonical equivalents.
     */
    private transient String normalizedPattern;

    /**
     * The starting point of state machine for the find operation.  This allows
     * a match to start anywhere in the input.  Package-private.
     */
    transient Node root;

    /**
     * The root of object tree for a match operation.  The pattern is matched
     * at the beginning.  This may include a find that uses BnM or a First
     * node.  Package-private.
     */
    transient Node matchRoot;

    /**
     * Temporary storage used while parsing a pattern slice.
     */
    transient int[] buffer;

    /**
     * Temporary storage used while parsing group references.
     */
    transient GroupHead[] groupNodes;

    /**
     * Temporary null terminated code point array used by pattern compiling.
     * RemoveQEQuoting() replaces it with a rewritten copy that ends in two
     * zero elements.
     */
    private transient int[] temp;

    /**
     * The number of capturing groups in this Pattern. Used by matchers to
     * allocate storage needed to perform a match.  Initialized to 1 by the
     * constructor and by readObject.
     */
    transient int capturingGroupCount;

    /**
     * The local variable count used by parsing tree. Used by matchers to
     * allocate storage needed to perform a match.  Initialized to 0 by the
     * constructor and by readObject.
     */
    transient int localCount;

    /**
     * Index into the pattern string that keeps track of how much has been
     * parsed.
     */
    private transient int cursor;

    /**
     * Holds the length of the pattern string.  normalize() sets it to the
     * length of the NFD-normalized string; RemoveQEQuoting() resets it to the
     * number of elements it kept in <tt>temp</tt>.
     */
    private transient int patternLength;
812 
813     /**
814      * Compiles the given regular expression into a pattern.  </p>
815      *
816      * @param  regex
817      *         The expression to be compiled
818      *
819      * @throws  PatternSyntaxException
820      *          If the expression's syntax is invalid
821      */
822     public static Pattern compile(String regex) {
823         return new Pattern(regex, 0);
824     }
825 
826     /**
827      * Compiles the given regular expression into a pattern with the given
828      * flags.  </p>
829      *
830      * @param  regex
831      *         The expression to be compiled
832      *
833      * @param  flags
834      *         Match flags, a bit mask that may include
835      *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
836      *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
837      *         {@link #LITERAL} and {@link #COMMENTS}
838      *
839      * @throws  IllegalArgumentException
840      *          If bit values other than those corresponding to the defined
841      *          match flags are set in <tt>flags</tt>
842      *
843      * @throws  PatternSyntaxException
844      *          If the expression's syntax is invalid
845      */
846     public static Pattern compile(String regex, int flags) {
847         return new Pattern(regex, flags);
848     }
849 
850     /**
851      * Returns the regular expression from which this pattern was compiled.
852      * </p>
853      *
854      * @return  The source of this pattern
855      */
856     public String pattern() {
857         return pattern;
858     }
859 
860     /**
861      * <p>Returns the string representation of this pattern. This
862      * is the regular expression from which this pattern was
863      * compiled.</p>
864      *
865      * @return  The string representation of this pattern
866      * @since 1.5
867      */
868     public String toString() {
869         return pattern;
870     }
871 
872     /**
873      * Creates a matcher that will match the given input against this pattern.
874      * </p>
875      *
876      * @param  input
877      *         The character sequence to be matched
878      *
879      * @return  A new matcher for this pattern
880      */
881     public Matcher matcher(CharSequence input) {
882     if (!compiled) {
883         synchronized(this) {
884         if (!compiled)
885             compile();
886         }
887     }
888         Matcher m = new Matcher(this, input);
889         return m;
890     }
891 
892     /**
893      * Returns this pattern's match flags.  </p>
894      *
895      * @return  The match flags specified when this pattern was compiled
896      */
897     public int flags() {
898         return flags;
899     }
900 
901     /**
902      * Compiles the given regular expression and attempts to match the given
903      * input against it.
904      *
905      * <p> An invocation of this convenience method of the form
906      *
907      * <blockquote><pre>
908      * Pattern.matches(regex, input);</pre></blockquote>
909      *
910      * behaves in exactly the same way as the expression
911      *
912      * <blockquote><pre>
913      * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
914      *
915      * <p> If a pattern is to be used multiple times, compiling it once and reusing
916      * it will be more efficient than invoking this method each time.  </p>
917      *
918      * @param  regex
919      *         The expression to be compiled
920      *
921      * @param  input
922      *         The character sequence to be matched
923      *
924      * @throws  PatternSyntaxException
925      *          If the expression's syntax is invalid
926      */
927     public static boolean matches(String regex, CharSequence input) {
928         Pattern p = Pattern.compile(regex);
929         Matcher m = p.matcher(input);
930         return m.matches();
931     }
932 
933     /**
934      * Splits the given input sequence around matches of this pattern.
935      *
936      * <p> The array returned by this method contains each substring of the
937      * input sequence that is terminated by another subsequence that matches
938      * this pattern or is terminated by the end of the input sequence.  The
939      * substrings in the array are in the order in which they occur in the
940      * input.  If this pattern does not match any subsequence of the input then
941      * the resulting array has just one element, namely the input sequence in
942      * string form.
943      *
944      * <p> The <tt>limit</tt> parameter controls the number of times the
945      * pattern is applied and therefore affects the length of the resulting
946      * array.  If the limit <i>n</i> is greater than zero then the pattern
947      * will be applied at most <i>n</i>&nbsp;-&nbsp;1 times, the array's
948      * length will be no greater than <i>n</i>, and the array's last entry
949      * will contain all input beyond the last matched delimiter.  If <i>n</i>
950      * is non-positive then the pattern will be applied as many times as
951      * possible and the array can have any length.  If <i>n</i> is zero then
952      * the pattern will be applied as many times as possible, the array can
953      * have any length, and trailing empty strings will be discarded.
954      *
955      * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
956      * results with these parameters:
957      *
958      * <blockquote><table cellpadding=1 cellspacing=0
959      *              summary="Split examples showing regex, limit, and result">
960      * <tr><th><P align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
961      *     <th><P align="left"><i>Limit&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
962      *     <th><P align="left"><i>Result&nbsp;&nbsp;&nbsp;&nbsp;</i></th></tr>
963      * <tr><td align=center>:</td>
964      *     <td align=center>2</td>
965      *     <td><tt>{ "boo", "and:foo" }</tt></td></tr>
966      * <tr><td align=center>:</td>
967      *     <td align=center>5</td>
968      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
969      * <tr><td align=center>:</td>
970      *     <td align=center>-2</td>
971      *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
972      * <tr><td align=center>o</td>
973      *     <td align=center>5</td>
974      *     <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
975      * <tr><td align=center>o</td>
976      *     <td align=center>-2</td>
977      *     <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
978      * <tr><td align=center>o</td>
979      *     <td align=center>0</td>
980      *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
981      * </table></blockquote>
982      *
983      *
984      * @param  input
985      *         The character sequence to be split
986      *
987      * @param  limit
988      *         The result threshold, as described above
989      *
990      * @return  The array of strings computed by splitting the input
991      *          around matches of this pattern
992      */
993     public String[] split(CharSequence input, int limit) {
994         int index = 0;
995         boolean matchLimited = limit > 0;
996         ArrayList<String> matchList = new ArrayList<String>();
997         Matcher m = matcher(input);
998 
999         // Add segments before each match found
1000        while(m.find()) {
1001            if (!matchLimited || matchList.size() < limit - 1) {
1002                String match = input.subSequence(index, m.start()).toString();
1003                matchList.add(match);
1004                index = m.end();
1005            } else if (matchList.size() == limit - 1) { // last one
1006                String match = input.subSequence(index,
1007                                                 input.length()).toString();
1008                matchList.add(match);
1009                index = m.end();
1010            }
1011        }
1012
1013        // If no match was found, return this
1014        if (index == 0)
1015            return new String[] {input.toString()};
1016
1017        // Add remaining segment
1018        if (!matchLimited || matchList.size() < limit)
1019            matchList.add(input.subSequence(index, input.length()).toString());
1020
1021        // Construct result
1022        int resultSize = matchList.size();
1023        if (limit == 0)
1024            while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
1025                resultSize--;
1026        String[] result = new String[resultSize];
1027        return matchList.subList(0, resultSize).toArray(result);
1028    }
1029
1030    /**
1031     * Splits the given input sequence around matches of this pattern.
1032     *
1033     * <p> This method works as if by invoking the two-argument {@link
1034     * #split(java.lang.CharSequence, int) split} method with the given input
1035     * sequence and a limit argument of zero.  Trailing empty strings are
1036     * therefore not included in the resulting array. </p>
1037     *
1038     * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
1039     * results with these expressions:
1040     *
1041     * <blockquote><table cellpadding=1 cellspacing=0
1042     *              summary="Split examples showing regex and result">
1043     * <tr><th><P align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
1044     *     <th><P align="left"><i>Result</i></th></tr>
1045     * <tr><td align=center>:</td>
1046     *     <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
1047     * <tr><td align=center>o</td>
1048     *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
1049     * </table></blockquote>
1050     *
1051     *
1052     * @param  input
1053     *         The character sequence to be split
1054     *
1055     * @return  The array of strings computed by splitting the input
1056     *          around matches of this pattern
1057     */
    public String[] split(CharSequence input) {
        // Limit 0: apply the pattern as often as possible and drop
        // trailing empty strings.
        return split(input, 0);
    }
1061
1062    /**
1063     * Returns a literal pattern <code>String</code> for the specified
1064     * <code>String</code>.
1065     *
1066     * <p>This method produces a <code>String</code> that can be used to
1067     * create a <code>Pattern</code> that would match the string
1068     * <code>s</code> as if it were a literal pattern.</p> Metacharacters
1069     * or escape sequences in the input sequence will be given no special
1070     * meaning.
1071     *
1072     * @param  s The string to be literalized
1073     * @return  A literal string replacement
1074     * @since 1.5
1075     */
1076    public static String quote(String s) {
1077        int slashEIndex = s.indexOf("\\E");
1078        if (slashEIndex == -1)
1079            return "\\Q" + s + "\\E";
1080
1081        StringBuilder sb = new StringBuilder(s.length() * 2);
1082        sb.append("\\Q");
1083        slashEIndex = 0;
1084        int current = 0;
1085        while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
1086            sb.append(s.substring(current, slashEIndex));
1087            current = slashEIndex + 2;
1088            sb.append("\\E\\\\E\\Q");
1089        }
1090        sb.append(s.substring(current, s.length()));
1091        sb.append("\\E");
1092        return sb.toString();
1093    }
1094
1095    /**
1096     * Recompile the Pattern instance from a stream.  The original pattern
1097     * string is read in and the object tree is recompiled from it.
1098     */
1099    private void readObject(java.io.ObjectInputStream s)
1100        throws java.io.IOException, ClassNotFoundException {
1101
1102        // Read in all fields
1103    s.defaultReadObject();
1104
1105        // Initialize counts
1106        capturingGroupCount = 1;
1107        localCount = 0;
1108
1109        // if length > 0, the Pattern is lazily compiled
1110        compiled = false;
1111        if (pattern.length() == 0) {
1112            root = new Start(lastAccept);
1113            matchRoot = lastAccept;
1114            compiled = true;
1115        }
1116    }
1117
1118    /**
1119     * This private constructor is used to create all Patterns. The pattern
1120     * string and match flags are all that is needed to completely describe
1121     * a Pattern. An empty pattern string results in an object tree with
1122     * only a Start node and a LastNode node.
1123     */
1124    private Pattern(String p, int f) {
1125        pattern = p;
1126        flags = f;
1127
1128        // Reset group index count
1129        capturingGroupCount = 1;
1130        localCount = 0;
1131
1132        if (pattern.length() > 0) {
1133            compile();
1134        } else {
1135            root = new Start(lastAccept);
1136            matchRoot = lastAccept;
1137        }
1138    }
1139
1140    /**
1141     * The pattern is converted to normalizedD form and then a pure group
1142     * is constructed to match canonical equivalences of the characters.
1143     */
1144    private void normalize() {
1145        boolean inCharClass = false;
1146        int lastCodePoint = -1;
1147
1148        // Convert pattern into normalizedD form
1149        normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
1150        patternLength = normalizedPattern.length();
1151
1152        // Modify pattern to match canonical equivalences
1153        StringBuilder newPattern = new StringBuilder(patternLength);
1154        for(int i=0; i<patternLength; ) {
1155            int c = normalizedPattern.codePointAt(i);
1156            StringBuilder sequenceBuffer;
1157            if ((Character.getType(c) == Character.NON_SPACING_MARK)
1158                && (lastCodePoint != -1)) {
1159                sequenceBuffer = new StringBuilder();
1160                sequenceBuffer.appendCodePoint(lastCodePoint);
1161                sequenceBuffer.appendCodePoint(c);
1162                while(Character.getType(c) == Character.NON_SPACING_MARK) {
1163                    i += Character.charCount(c);
1164                    if (i >= patternLength)
1165                        break;
1166                    c = normalizedPattern.codePointAt(i);
1167                    sequenceBuffer.appendCodePoint(c);
1168                }
1169                String ea = produceEquivalentAlternation(
1170                                               sequenceBuffer.toString());
1171                newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
1172                newPattern.append("(?:").append(ea).append(")");
1173            } else if (c == '[' && lastCodePoint != '\\') {
1174                i = normalizeCharClass(newPattern, i);
1175            } else {
1176                newPattern.appendCodePoint(c);
1177            }
1178            lastCodePoint = c;
1179        i += Character.charCount(c);
1180        }
1181        normalizedPattern = newPattern.toString();
1182    }
1183
1184    /**
1185     * Complete the character class being parsed and add a set
1186     * of alternations to it that will match the canonical equivalences
1187     * of the characters within the class.
1188     */
1189    private int normalizeCharClass(StringBuilder newPattern, int i) {
1190        StringBuilder charClass = new StringBuilder();
1191        StringBuilder eq = null;
1192        int lastCodePoint = -1;
1193        String result;
1194
1195        i++;
1196        charClass.append("[");
1197        while(true) {
1198            int c = normalizedPattern.codePointAt(i);
1199            StringBuilder sequenceBuffer;
1200
1201            if (c == ']' && lastCodePoint != '\\') {
1202                charClass.append((char)c);
1203                break;
1204            } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
1205                sequenceBuffer = new StringBuilder();
1206                sequenceBuffer.appendCodePoint(lastCodePoint);
1207                while(Character.getType(c) == Character.NON_SPACING_MARK) {
1208                    sequenceBuffer.appendCodePoint(c);
1209                    i += Character.charCount(c);
1210                    if (i >= normalizedPattern.length())
1211                        break;
1212                    c = normalizedPattern.codePointAt(i);
1213                }
1214                String ea = produceEquivalentAlternation(
1215                                                  sequenceBuffer.toString());
1216
1217                charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
1218                if (eq == null)
1219                    eq = new StringBuilder();
1220                eq.append('|');
1221                eq.append(ea);
1222            } else {
1223                charClass.appendCodePoint(c);
1224                i++;
1225            }
1226            if (i == normalizedPattern.length())
1227                throw error("Unclosed character class");
1228            lastCodePoint = c;
1229        }
1230
1231        if (eq != null) {
1232            result = "(?:"+charClass.toString()+eq.toString()+")";
1233        } else {
1234            result = charClass.toString();
1235        }
1236
1237        newPattern.append(result);
1238        return i;
1239    }
1240
1241    /**
1242     * Given a specific sequence composed of a regular character and
1243     * combining marks that follow it, produce the alternation that will
1244     * match all canonical equivalences of that sequence.
1245     */
1246    private String produceEquivalentAlternation(String source) {
1247    int len = countChars(source, 0, 1);
1248        if (source.length() == len)
1249        // source has one character.
1250            return source;
1251
1252        String base = source.substring(0,len);
1253        String combiningMarks = source.substring(len);
1254
1255        String[] perms = producePermutations(combiningMarks);
1256        StringBuilder result = new StringBuilder(source);
1257
1258        // Add combined permutations
1259        for(int x=0; x<perms.length; x++) {
1260            String next = base + perms[x];
1261            if (x>0)
1262                result.append("|"+next);
1263            next = composeOneStep(next);
1264            if (next != null)
1265                result.append("|"+produceEquivalentAlternation(next));
1266        }
1267        return result.toString();
1268    }
1269
1270    /**
1271     * Returns an array of strings that have all the possible
1272     * permutations of the characters in the input string.
1273     * This is used to get a list of all possible orderings
1274     * of a set of combining marks. Note that some of the permutations
1275     * are invalid because of combining class collisions, and these
1276     * possibilities must be removed because they are not canonically
1277     * equivalent.
1278     */
1279    private String[] producePermutations(String input) {
1280        if (input.length() == countChars(input, 0, 1))
1281            return new String[] {input};
1282
1283        if (input.length() == countChars(input, 0, 2)) {
1284        int c0 = Character.codePointAt(input, 0);
1285        int c1 = Character.codePointAt(input, Character.charCount(c0));
1286            if (getClass(c1) == getClass(c0)) {
1287                return new String[] {input};
1288            }
1289            String[] result = new String[2];
1290            result[0] = input;
1291            StringBuilder sb = new StringBuilder(2);
1292        sb.appendCodePoint(c1);
1293        sb.appendCodePoint(c0);
1294            result[1] = sb.toString();
1295            return result;
1296        }
1297
1298        int length = 1;
1299    int nCodePoints = countCodePoints(input);
1300        for(int x=1; x<nCodePoints; x++)
1301            length = length * (x+1);
1302
1303        String[] temp = new String[length];
1304
1305        int combClass[] = new int[nCodePoints];
1306        for(int x=0, i=0; x<nCodePoints; x++) {
1307        int c = Character.codePointAt(input, i);
1308            combClass[x] = getClass(c);
1309        i +=  Character.charCount(c);
1310    }
1311
1312        // For each char, take it out and add the permutations
1313        // of the remaining chars
1314        int index = 0;
1315    int len;
1316    // offset maintains the index in code units.
1317loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
1318        len = countChars(input, offset, 1);
1319            boolean skip = false;
1320            for(int y=x-1; y>=0; y--) {
1321                if (combClass[y] == combClass[x]) {
1322                    continue loop;
1323                }
1324            }
1325            StringBuilder sb = new StringBuilder(input);
1326            String otherChars = sb.delete(offset, offset+len).toString();
1327            String[] subResult = producePermutations(otherChars);
1328
1329            String prefix = input.substring(offset, offset+len);
1330            for(int y=0; y<subResult.length; y++)
1331                temp[index++] =  prefix + subResult[y];
1332        }
1333        String[] result = new String[index];
1334        for (int x=0; x<index; x++)
1335            result[x] = temp[x];
1336        return result;
1337    }
1338
    /**
     * Returns the combining class of the given code point, as reported by
     * <tt>sun.text.Normalizer.getCombiningClass</tt>.  This is an overload
     * taking an <tt>int</tt>; it is unrelated to <tt>Object.getClass()</tt>.
     */
    private int getClass(int c) {
        return sun.text.Normalizer.getCombiningClass(c);
    }
1342
1343    /**
1344     * Attempts to compose input by combining the first character
1345     * with the first combining mark following it. Returns a String
1346     * that is the composition of the leading character with its first
1347     * combining mark followed by the remaining combining marks. Returns
1348     * null if the first two characters cannot be further composed.
1349     */
1350    private String composeOneStep(String input) {
1351    int len = countChars(input, 0, 2);
1352        String firstTwoCharacters = input.substring(0, len);
1353        String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
1354
1355        if (result.equals(firstTwoCharacters))
1356            return null;
1357        else {
1358            String remainder = input.substring(len);
1359            return result + remainder;
1360        }
1361    }
1362
1363    /**
1364     * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1365     * See the description of `quotemeta' in perlfunc(1).
1366     */
1367    private void RemoveQEQuoting() {
1368    final int pLen = patternLength;
1369    int i = 0;
1370    while (i < pLen-1) {
1371        if (temp[i] != '\\')
1372        i += 1;
1373        else if (temp[i + 1] != 'Q')
1374        i += 2;
1375        else
1376        break;
1377    }
1378    if (i >= pLen - 1)    // No \Q sequence found
1379        return;
1380    int j = i;
1381    i += 2;
1382    int[] newtemp = new int[j + 2*(pLen-i) + 2];
1383    System.arraycopy(temp, 0, newtemp, 0, j);
1384
1385    boolean inQuote = true;
1386    while (i < pLen) {
1387        int c = temp[i++];
1388        if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {
1389        newtemp[j++] = c;
1390        } else if (c != '\\') {
1391        if (inQuote) newtemp[j++] = '\\';
1392        newtemp[j++] = c;
1393        } else if (inQuote) {
1394        if (temp[i] == 'E') {
1395            i++;
1396            inQuote = false;
1397        } else {
1398            newtemp[j++] = '\\';
1399            newtemp[j++] = '\\';
1400        }
1401        } else {
1402        if (temp[i] == 'Q') {
1403            i++;
1404            inQuote = true;
1405        } else {
1406            newtemp[j++] = c;
1407            if (i != pLen)
1408            newtemp[j++] = temp[i++];
1409        }
1410        }
1411    }
1412
1413    patternLength = j;
1414    temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1415    }
1416
1417    /**
1418     * Copies regular expression to an int array and invokes the parsing
1419     * of the expression which will create the object tree.
1420     */
1421    private void compile() {
1422        // Handle canonical equivalences
1423        if (has(CANON_EQ) && !has(LITERAL)) {
1424            normalize();
1425        } else {
1426            normalizedPattern = pattern;
1427        }
1428        patternLength = normalizedPattern.length();
1429
1430        // Copy pattern to int array for convenience
1431        // Use double zero to terminate pattern
1432        temp = new int[patternLength + 2];
1433
1434    boolean hasSupplementary = false;
1435    int c, count = 0;
1436    // Convert all chars into code points
1437    for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1438        c = normalizedPattern.codePointAt(x);
1439        if (isSupplementary(c)) {
1440        hasSupplementary = true;
1441        }
1442        temp[count++] = c;
1443    }
1444
1445    patternLength = count;   // patternLength now in code points
1446
1447    if (! has(LITERAL))
1448        RemoveQEQuoting();
1449
1450        // Allocate all temporary objects here.
1451        buffer = new int[32];
1452        groupNodes = new GroupHead[10];
1453
1454        if (has(LITERAL)) {
1455            // Literal pattern handling
1456            matchRoot = newSlice(temp, patternLength, hasSupplementary);
1457            matchRoot.next = lastAccept;
1458        } else {
1459            // Start recursive descent parsing
1460            matchRoot = expr(lastAccept);
1461            // Check extra pattern characters
1462            if (patternLength != cursor) {
1463                if (peek() == ')') {
1464                    throw error("Unmatched closing ')'");
1465                } else {
1466                    throw error("Unexpected internal error");
1467                }
1468            }
1469        }
1470
1471        // Peephole optimization
1472        if (matchRoot instanceof Slice) {
1473            root = BnM.optimize(matchRoot);
1474            if (root == matchRoot) {
1475                root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1476            }
1477        } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1478            root = matchRoot;
1479        } else {
1480            root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1481        }
1482
1483        // Release temporary storage
1484        temp = null;
1485        buffer = null;
1486        groupNodes = null;
1487        patternLength = 0;
1488        compiled = true;
1489    }
1490
1491    /**
1492     * Used to print out a subtree of the Pattern to help with debugging.
1493     */
1494    private static void printObjectTree(Node node) {
1495        while(node != null) {
1496            if (node instanceof Prolog) {
1497                System.out.println(node);
1498                printObjectTree(((Prolog)node).loop);
1499                System.out.println("**** end contents prolog loop");
1500            } else if (node instanceof Loop) {
1501                System.out.println(node);
1502                printObjectTree(((Loop)node).body);
1503                System.out.println("**** end contents Loop body");
1504            } else if (node instanceof Curly) {
1505                System.out.println(node);
1506                printObjectTree(((Curly)node).atom);
1507                System.out.println("**** end contents Curly body");
1508            } else if (node instanceof GroupCurly) {
1509                System.out.println(node);
1510                printObjectTree(((GroupCurly)node).atom);
1511                System.out.println("**** end contents GroupCurly body");
1512            } else if (node instanceof GroupTail) {
1513                System.out.println(node);
1514                System.out.println("Tail next is "+node.next);
1515                return;
1516            } else {
1517                System.out.println(node);
1518            }
1519            node = node.next;
1520            if (node != null)
1521                System.out.println("->next:");
1522            if (node == Pattern.accept) {
1523                System.out.println("Accept Node");
1524                node = null;
1525            }
1526       }
1527    }
1528
1529    /**
1530     * Used to accumulate information about a subtree of the object graph
1531     * so that optimizations can be applied to the subtree.
1532     */
1533    static final class TreeInfo {
1534        int minLength;
1535        int maxLength;
1536        boolean maxValid;
1537        boolean deterministic;
1538
1539        TreeInfo() {
1540            reset();
1541        }
1542        void reset() {
1543            minLength = 0;
1544            maxLength = 0;
1545            maxValid = true;
1546            deterministic = true;
1547        }
1548    }
1549
1550    /*
1551     * The following private methods are mainly used to improve the
1552     * readability of the code. In order to let the Java compiler easily
1553     * inline them, we should not put many assertions or error checks in them.
1554     */
1555
1556    /**
1557     * Indicates whether a particular flag is set or not.
1558     */
1559    private boolean has(int f) {
1560        return (flags & f) != 0;
1561    }
1562
1563    /**
1564     * Match next character, signal error if failed.
1565     */
1566    private void accept(int ch, String s) {
1567        int testChar = temp[cursor++];
1568        if (has(COMMENTS))
1569            testChar = parsePastWhitespace(testChar);
1570        if (ch != testChar) {
1571        throw error(s);
1572        }
1573    }
1574
1575    /**
1576     * Mark the end of pattern with a specific character.
1577     */
1578    private void mark(int c) {
1579        temp[patternLength] = c;
1580    }
1581
1582    /**
1583     * Peek the next character, and do not advance the cursor.
1584     */
1585    private int peek() {
1586        int ch = temp[cursor];
1587        if (has(COMMENTS))
1588            ch = peekPastWhitespace(ch);
1589        return ch;
1590    }
1591
1592    /**
1593     * Read the next character, and advance the cursor by one.
1594     */
1595    private int read() {
1596        int ch = temp[cursor++];
1597        if (has(COMMENTS))
1598            ch = parsePastWhitespace(ch);
1599        return ch;
1600    }
1601
1602    /**
1603     * Read the next character, and advance the cursor by one,
1604     * ignoring the COMMENTS setting
1605     */
1606    private int readEscaped() {
1607        int ch = temp[cursor++];
1608        return ch;
1609    }
1610
1611    /**
1612     * Advance the cursor by one, and peek the next character.
1613     */
1614    private int next() {
1615        int ch = temp[++cursor];
1616        if (has(COMMENTS))
1617            ch = peekPastWhitespace(ch);
1618        return ch;
1619    }
1620
1621    /**
1622     * Advance the cursor by one, and peek the next character,
1623     * ignoring the COMMENTS setting
1624     */
1625    private int nextEscaped() {
1626        int ch = temp[++cursor];
1627        return ch;
1628    }
1629
1630    /**
1631     * If in xmode peek past whitespace and comments.
1632     */
1633    private int peekPastWhitespace(int ch) {
1634        while (ASCII.isSpace(ch) || ch == '#') {
1635            while (ASCII.isSpace(ch))
1636                ch = temp[++cursor];
1637            if (ch == '#') {
1638                ch = peekPastLine();
1639            }
1640        }
1641        return ch;
1642    }
1643
1644    /**
1645     * If in xmode parse past whitespace and comments.
1646     */
1647    private int parsePastWhitespace(int ch) {
1648        while (ASCII.isSpace(ch) || ch == '#') {
1649            while (ASCII.isSpace(ch))
1650                ch = temp[cursor++];
1651            if (ch == '#')
1652                ch = parsePastLine();
1653        }
1654        return ch;
1655    }
1656
1657    /**
1658     * xmode parse past comment to end of line.
1659     */
1660    private int parsePastLine() {
1661        int ch = temp[cursor++];
1662        while (ch != 0 && !isLineSeparator(ch))
1663            ch = temp[cursor++];
1664        return ch;
1665    }
1666
1667    /**
1668     * xmode peek past comment to end of line.
1669     */
1670    private int peekPastLine() {
1671        int ch = temp[++cursor];
1672        while (ch != 0 && !isLineSeparator(ch))
1673            ch = temp[++cursor];
1674        return ch;
1675    }
1676
1677    /**
1678     * Determines if character is a line separator in the current mode
1679     */
1680    private boolean isLineSeparator(int ch) {
1681        if (has(UNIX_LINES)) {
1682            return ch == '\n';
1683        } else {
1684            return (ch == '\n' ||
1685                    ch == '\r' ||
1686                    (ch|1) \u2029= 'โ€ฉ' ||
1687                    ch \u0085= 'ย…');
1688        }
1689    }
1690
1691    /**
1692     * Read the character after the next one, and advance the cursor by two.
1693     */
1694    private int skip() {
1695        int i = cursor;
1696        int ch = temp[i+1];
1697        cursor = i + 2;
1698        return ch;
1699    }
1700
1701    /**
1702     * Unread one next character, and retreat cursor by one.
1703     */
1704    private void unread() {
1705        cursor--;
1706    }
1707
1708    /**
1709     * Internal method used for handling all syntax errors. The pattern is
1710     * displayed with a pointer to aid in locating the syntax error.
1711     */
1712    private PatternSyntaxException error(String s) {
1713    return new PatternSyntaxException(s, normalizedPattern,  cursor - 1);
1714    }
1715
1716    /**
1717     * Determines if there is any supplementary character or unpaired
1718     * surrogate in the specified range.
1719     */
1720    private boolean findSupplementary(int start, int end) {
1721    for (int i = start; i < end; i++) {
1722        if (isSupplementary(temp[i]))
1723        return true;
1724    }
1725    return false;
1726    }
1727
1728    /**
1729     * Determines if the specified code point is a supplementary
1730     * character or unpaired surrogate.
1731     */
1732    private static final boolean isSupplementary(int ch) {
1733    return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
1734    }
1735
1736    /**
1737     *  The following methods handle the main parsing. They are sorted
1738     *  according to their precedence order, the lowest one first.
1739     */
1740
1741    /**
1742     * The expression is parsed with branch nodes added for alternations.
1743     * This may be called recursively to parse sub expressions that may
1744     * contain alternations.
1745     */
1746    private Node expr(Node end) {
1747        Node prev = null;
1748        Node firstTail = null;
1749        Node branchConn = null;
1750
1751        for (;;) {
1752            Node node = sequence(end);
1753            Node nodeTail = root;      //double return
1754            if (prev == null) {
1755                prev = node;
1756                firstTail = nodeTail;
1757            } else {
1758            // Branch
1759            if (branchConn == null) {
1760                    branchConn = new BranchConn();
1761                    branchConn.next = end;
1762                }  
1763                if (node == end) {
1764                    // if the node returned from sequence() is "end"
1765            // we have an empty expr, set a null atom into
1766            // the branch to indicate to go "next" directly.
1767            node = null;
1768                } else {
1769            // the "tail.next" of each atom goes to branchConn
1770                    nodeTail.next = branchConn;
1771                }
1772            if (prev instanceof Branch) {
1773                    ((Branch)prev).add(node);
1774                } else {
1775            if (prev == end) {
1776                        prev = null;
1777                    } else {
1778                        // replace the "end" with "branchConn" at its tail.next
1779                        // when put the "prev" into the branch as the first atom.
1780                        firstTail.next = branchConn;
1781                    }
1782                    prev = new Branch(prev, node, branchConn);
1783                }
1784            }
1785            if (peek() != '|') {
1786                return prev;
1787            }
1788            next();
1789        }
1790    }
1791
1792    /**
1793     * Parsing of sequences between alternations.
1794     */
1795    private Node sequence(Node end) {
1796        Node head = null;
1797        Node tail = null;
1798        Node node = null;
1799    LOOP:
1800        for (;;) {
1801            int ch = peek();
1802            switch (ch) {
1803            case '(':
1804                // Because group handles its own closure,
1805                // we need to treat it differently
1806                node = group0();
1807                // Check for comment or flag group
1808                if (node == null)
1809                    continue;
1810                if (head == null)
1811                    head = node;
1812                else
1813                    tail.next = node;
1814                // Double return: Tail was returned in root
1815                tail = root;
1816                continue;
1817            case '[':
1818                node = clazz(true);
1819                break;
1820            case '\\':
1821                ch = nextEscaped();
1822                if (ch == 'p' || ch == 'P') {
1823                    boolean oneLetter = true;
1824            boolean comp = (ch == 'P');
1825                    ch = next(); // Consume { if present
1826                    if (ch != '{') {
1827                        unread();
1828                    } else {
1829                        oneLetter = false;
1830                    }
1831            node = family(oneLetter).maybeComplement(comp);
1832                } else {
1833                    unread();
1834                    node = atom();
1835                }
1836                break;
1837            case '^':
1838                next();
1839                if (has(MULTILINE)) {
1840                    if (has(UNIX_LINES))
1841                        node = new UnixCaret();
1842                    else
1843                        node = new Caret();
1844                } else {
1845                    node = new Begin();
1846                }
1847                break;
1848            case '$':
1849                next();
1850                if (has(UNIX_LINES))
1851                    node = new UnixDollar(has(MULTILINE));
1852                else
1853                    node = new Dollar(has(MULTILINE));
1854                break;
1855            case '.':
1856                next();
1857                if (has(DOTALL)) {
1858                    node = new All();
1859                } else {
1860                    if (has(UNIX_LINES))
1861                        node = new UnixDot();
1862                    else {
1863                        node = new Dot();
1864                    }
1865                }
1866                break;
1867            case '|':
1868            case ')':
1869                break LOOP;
1870            case ']': // Now interpreting dangling ] and } as literals
1871            case '}':
1872                node = atom();
1873                break;
1874            case '?':
1875            case '*':
1876            case '+':
1877                next();
1878                throw error("Dangling meta character '" + ((char)ch) + "'");
1879            case 0:
1880                if (cursor >= patternLength) {
1881                    break LOOP;
1882                }
1883                // Fall through
1884            default:
1885                node = atom();
1886                break;
1887            }
1888
1889            node = closure(node);
1890
1891            if (head == null) {
1892                head = tail = node;
1893            } else {
1894                tail.next = node;
1895                tail = node;
1896            }
1897        }
1898        if (head == null) {
1899            return end;
1900        }
1901        tail.next = end;
1902        root = tail;      //double return
1903        return head;
1904    }
1905
1906    /**
1907     * Parse and add a new Single or Slice.
1908     */
1909    private Node atom() {
1910        int first = 0;
1911        int prev = -1;
1912    boolean hasSupplementary = false;
1913        int ch = peek();
1914        for (;;) {
1915            switch (ch) {
1916            case '*':
1917            case '+':
1918            case '?':
1919            case '{':
1920                if (first > 1) {
1921                    cursor = prev;    // Unwind one character
1922                    first--;
1923                }
1924                break;
1925            case '$':
1926            case '.':
1927            case '^':
1928            case '(':
1929            case '[':
1930            case '|':
1931            case ')':
1932                break;
1933            case '\\':
1934                ch = nextEscaped();
1935                if (ch == 'p' || ch == 'P') { // Property
1936                    if (first > 0) { // Slice is waiting; handle it first
1937                        unread();
1938                        break;
1939                    } else { // No slice; just return the family node
1940            boolean comp = (ch == 'P');
1941            boolean oneLetter = true;
1942            ch = next(); // Consume { if present
1943            if (ch != '{')
1944                unread();
1945            else
1946                oneLetter = false;
1947            return family(oneLetter).maybeComplement(comp);
1948                    }
1949                }
1950                unread();
1951                prev = cursor;
1952                ch = escape(false, first == 0);
1953                if (ch >= 0) {
1954                    append(ch, first);
1955                    first++;
1956            if (isSupplementary(ch)) {
1957            hasSupplementary = true;
1958            }
1959                    ch = peek();
1960                    continue;
1961                } else if (first == 0) {
1962                    return root;
1963                }
1964                // Unwind meta escape sequence
1965                cursor = prev;
1966                break;
1967            case 0:
1968                if (cursor >= patternLength) {
1969                    break;
1970                }
1971                // Fall through
1972            default:
1973                prev = cursor;
1974                append(ch, first);
1975                first++;
1976        if (isSupplementary(ch)) {
1977            hasSupplementary = true;
1978        }
1979                ch = next();
1980                continue;
1981            }
1982            break;
1983        }
1984        if (first == 1) {
1985            return newSingle(buffer[0]);
1986        } else {
1987            return newSlice(buffer, first, hasSupplementary);
1988        }
1989    }
1990
1991    private void append(int ch, int len) {
1992        if (len >= buffer.length) {
1993            int[] tmp = new int[len+len];
1994            System.arraycopy(buffer, 0, tmp, 0, len);
1995            buffer = tmp;
1996        }
1997        buffer[len] = ch;
1998    }
1999
2000    /**
2001     * Parses a backref greedily, taking as many numbers as it
2002     * can. The first digit is always treated as a backref, but
2003     * multi digit numbers are only treated as a backref if at
2004     * least that many backrefs exist at this point in the regex.
2005     */
2006    private Node ref(int refNum) {
2007        boolean done = false;
2008        while(!done) {
2009            int ch = peek();
2010            switch(ch) {
2011        case '0':
2012        case '1':
2013        case '2':
2014        case '3':
2015        case '4':
2016        case '5':
2017        case '6':
2018        case '7':
2019        case '8':
2020        case '9':
2021        int newRefNum = (refNum * 10) + (ch - '0');
2022        // Add another number if it doesn't make a group
2023        // that doesn't exist
2024        if (capturingGroupCount - 1 < newRefNum) {
2025            done = true;
2026            break;
2027        }
2028        refNum = newRefNum;
2029        read();
2030        break;
2031        default:
2032        done = true;
2033        break;
2034            }
2035        }
2036        if (has(CASE_INSENSITIVE))
2037            return new CIBackRef(refNum, has(UNICODE_CASE));
2038        else
2039            return new BackRef(refNum);
2040    }
2041
2042    /**
2043     * Parses an escape sequence to determine the actual value that needs
2044     * to be matched.
2045     * If -1 is returned and create was true a new object was added to the tree
2046     * to handle the escape sequence.
2047     * If the returned value is greater than zero, it is the value that
2048     * matches the escape sequence.
2049     */
2050    private int escape(boolean inclass, boolean create) {
2051        int ch = skip();
2052        switch (ch) {
2053    case '0':
2054        return o();
2055    case '1':
2056    case '2':
2057    case '3':
2058    case '4':
2059    case '5':
2060    case '6':
2061    case '7':
2062    case '8':
2063    case '9':
2064        if (inclass) break;
2065        if (create) {
2066        root = ref((ch - '0'));
2067        }
2068        return -1;
2069    case 'A':
2070        if (inclass) break;
2071        if (create) root = new Begin();
2072        return -1;
2073    case 'B':
2074        if (inclass) break;
2075        if (create) root = new Bound(Bound.NONE);
2076        return -1;
2077    case 'C':
2078        break;
2079    case 'D':
2080        if (create) root = new Ctype(ASCII.DIGIT).complement();
2081        return -1;
2082    case 'E':
2083    case 'F':
2084        break;
2085    case 'G':
2086        if (inclass) break;
2087        if (create) root = new LastMatch();
2088        return -1;
2089    case 'H':
2090    case 'I':
2091    case 'J':
2092    case 'K':
2093    case 'L':
2094    case 'M':
2095    case 'N':
2096    case 'O':
2097    case 'P':
2098    case 'Q':
2099    case 'R':
2100        break;
2101    case 'S':
2102        if (create) root = new Ctype(ASCII.SPACE).complement();
2103        return -1;
2104    case 'T':
2105    case 'U':
2106    case 'V':
2107        break;
2108    case 'W':
2109        if (create) root = new Ctype(ASCII.WORD).complement();
2110        return -1;
2111    case 'X':
2112    case 'Y':
2113        break;
2114    case 'Z':
2115        if (inclass) break;
2116        if (create) {
2117        if (has(UNIX_LINES))
2118            root = new UnixDollar(false);
2119        else
2120            root = new Dollar(false);
2121        }
2122        return -1;
2123    case 'a':
2124        return '\007';
2125    case 'b':
2126        if (inclass) break;
2127        if (create) root = new Bound(Bound.BOTH);
2128        return -1;
2129    case 'c':
2130        return c();
2131    case 'd':
2132        if (create) root = new Ctype(ASCII.DIGIT);
2133        return -1;
2134    case 'e':
2135        return '\033';
2136    case 'f':
2137        return '\f';
2138    case 'g':
2139    case 'h':
2140    case 'i':
2141    case 'j':
2142    case 'k':
2143    case 'l':
2144    case 'm':
2145        break;
2146    case 'n':
2147        return '\n';
2148    case 'o':
2149    case 'p':
2150    case 'q':
2151        break;
2152    case 'r':
2153        return '\r';
2154    case 's':
2155        if (create) root = new Ctype(ASCII.SPACE);
2156        return -1;
2157    case 't':
2158        return '\t';
2159    case 'u':
2160        return u();
2161    case 'v':
2162        return '\013';
2163    case 'w':
2164        if (create) root = new Ctype(ASCII.WORD);
2165        return -1;
2166    case 'x':
2167        return x();
2168    case 'y':
2169        break;
2170    case 'z':
2171        if (inclass) break;
2172        if (create) root = new End();
2173        return -1;
2174    default:
2175        return ch;
2176        }
2177        throw error("Illegal/unsupported escape sequence");
2178    }
2179
2180    /**
2181     * Parse a character class, and return the node that matches it.
2182     *
2183     * Consumes a ] on the way out if consume is true. Usually consume
2184     * is true except for the case of [abc&&def] where def is a separate
2185     * right hand node with "understood" brackets.
2186     */
2187    private CharProperty clazz(boolean consume) {
2188        CharProperty prev = null;
2189        CharProperty node = null;
2190        BitClass bits = new BitClass();
2191        boolean include = true;
2192        boolean firstInClass = true;
2193        int ch = next();
2194        for (;;) {
2195            switch (ch) {
2196                case '^':
2197                    // Negates if first char in a class, otherwise literal
2198                    if (firstInClass) {
2199                        if (temp[cursor-1] != '[')
2200                            break;
2201                        ch = next();
2202                        include = !include;
2203                        continue;
2204                    } else {
2205                        // ^ not first in class, treat as literal
2206                        break;
2207                    }
2208                case '[':
2209                    firstInClass = false;
2210                    node = clazz(true);
2211                    if (prev == null)
2212                        prev = node;
2213                    else
2214                        prev = union(prev, node);
2215                    ch = peek();
2216                    continue;
2217                case '&':
2218                    firstInClass = false;
2219                    ch = next();
2220                    if (ch == '&') {
2221                        ch = next();
2222                        CharProperty rightNode = null;
2223                        while (ch != ']' && ch != '&') {
2224                            if (ch == '[') {
2225                                if (rightNode == null)
2226                                    rightNode = clazz(true);
2227                                else
2228                                    rightNode = union(rightNode, clazz(true));
2229                            } else { // abc&&def
2230                                unread();
2231                                rightNode = clazz(false);
2232                            }
2233                            ch = peek();
2234                        }
2235                        if (rightNode != null)
2236                            node = rightNode;
2237                        if (prev == null) {
2238                            if (rightNode == null)
2239                                throw error("Bad class syntax");
2240                            else
2241                                prev = rightNode;
2242                        } else {
2243                            prev = intersection(prev, node);
2244                        }
2245                    } else {
2246                        // treat as a literal &
2247                        unread();
2248                        break;
2249                    }
2250                    continue;
2251                case 0:
2252                    firstInClass = false;
2253                    if (cursor >= patternLength)
2254                        throw error("Unclosed character class");
2255                    break;
2256                case ']':
2257                    firstInClass = false;
2258                    if (prev != null) {
2259                        if (consume)
2260                            next();
2261                        return prev;
2262                    }
2263                    break;
2264                default:
2265                    firstInClass = false;
2266                    break;
2267            }
2268            node = range(bits);
2269            if (include) {
2270                if (prev == null) {
2271                    prev = node;
2272                } else {
2273                    if (prev != node)
2274                        prev = union(prev, node);
2275                }
2276            } else {
2277                if (prev == null) {
2278                    prev = node.complement();
2279                } else {
2280                    if (prev != node)
2281                        prev = setDifference(prev, node);
2282                }
2283            }
2284            ch = peek();
2285        }
2286    }
2287
2288    private CharProperty bitsOrSingle(BitClass bits, int ch) {
2289    /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2290       Use "single" node instead of bits when dealing with unicode
2291       case folding for codepoints listed below.
2292       (1)Uppercase out of range: u+00ff, u+00b5 
2293          toUpperCase(u+00ff) -> u+0178
2294          toUpperCase(u+00b5) -> u+039c
2295           (2)LatinSmallLetterLongS u+17f
2296          toUpperCase(u+017f) -> u+0053
2297       (3)LatinSmallLetterDotlessI u+131
2298          toUpperCase(u+0131) -> u+0049
2299       (4)LatinCapitalLetterIWithDotAbove u+0130
2300          toLowerCase(u+0130) -> u+0069
2301       (5)KelvinSign u+212a
2302          toLowerCase(u+212a) ==> u+006B
2303       (6)AngstromSign u+212b
2304          toLowerCase(u+212b) ==> u+00e5
2305    */
2306    int d;
2307    if (ch < 256 &&
2308        !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2309          (ch == 0xff || ch == 0xb5 ||
2310           ch == 0x49 || ch == 0x69 ||  //I and i
2311           ch == 0x53 || ch == 0x73 ||  //S and s
2312           ch == 0x4b || ch == 0x6b ||  //K and k
2313           ch == 0xc5 || ch == 0xe5)))  //A+ring
2314        return bits.add(ch, flags());
2315    return newSingle(ch);
2316    }
2317
2318    /**
2319     * Parse a single character or a character range in a character class
2320     * and return its representative node.
2321     */
2322    private CharProperty range(BitClass bits) {
2323        int ch = peek();
2324        if (ch == '\\') {
2325            ch = nextEscaped();
2326            if (ch == 'p' || ch == 'P') { // A property
2327                boolean comp = (ch == 'P');
2328                boolean oneLetter = true;
2329                // Consume { if present
2330                ch = next();
2331                if (ch != '{')
2332                    unread();
2333                else
2334                    oneLetter = false;
2335                return family(oneLetter).maybeComplement(comp);
2336            } else { // ordinary escape
2337                unread();
2338                ch = escape(true, true);
2339                if (ch == -1)
2340            return (CharProperty) root;
2341            }
2342        } else {
2343            ch = single();
2344        }
2345        if (ch >= 0) {
2346            if (peek() == '-') {
2347                int endRange = temp[cursor+1];
2348                if (endRange == '[') {
2349            return bitsOrSingle(bits, ch);
2350                }
2351                if (endRange != ']') {
2352                    next();
2353                    int m = single();
2354                    if (m < ch)
2355                        throw error("Illegal character range");
2356                    if (has(CASE_INSENSITIVE))
2357                        return caseInsensitiveRangeFor(ch, m);
2358                    else
2359                        return rangeFor(ch, m);
2360                }
2361            }
2362        return bitsOrSingle(bits, ch);
2363        }
2364        throw error("Unexpected character '"+((char)ch)+"'");
2365    }
2366
2367    private int single() {
2368        int ch = peek();
2369        switch (ch) {
2370        case '\\':
2371            return escape(true, false);
2372        default:
2373            next();
2374            return ch;
2375        }
2376    }
2377
2378    /**
2379     * Parses a Unicode character family and returns its representative node.
2380     */
2381    private CharProperty family(boolean singleLetter) {
2382        next();
2383        String name;
2384
2385        if (singleLetter) {
2386        int c = temp[cursor];
2387        if (!Character.isSupplementaryCodePoint(c)) {
2388        name = String.valueOf((char)c);
2389        } else {
2390        name = new String(temp, cursor, 1);
2391        }
2392            read();
2393        } else {
2394            int i = cursor;
2395            mark('}');
2396            while(read() != '}') {
2397            }
2398            mark('\000');
2399            int j = cursor;
2400            if (j > patternLength)
2401                throw error("Unclosed character family");
2402            if (i + 1 >= j)
2403                throw error("Empty character family");
2404            name = new String(temp, i, j-i-1);
2405        }
2406
2407        if (name.startsWith("In")) {
2408            return unicodeBlockPropertyFor(name.substring(2));
2409        } else {
2410        if (name.startsWith("Is"))
2411        name = name.substring(2);
2412        return charPropertyNodeFor(name);
2413    }
2414    }
2415
2416    /**
2417     * Returns a CharProperty matching all characters in a UnicodeBlock.
2418     */
2419    private CharProperty unicodeBlockPropertyFor(String name) {
2420    final Character.UnicodeBlock block;
2421        try {
2422            block = Character.UnicodeBlock.forName(name);
2423        } catch (IllegalArgumentException iae) {
2424            throw error("Unknown character block name {" + name + "}");
2425        }
2426    return new CharProperty() {
2427        boolean isSatisfiedBy(int ch) {
2428            return block == Character.UnicodeBlock.of(ch);}};
2429    }
2430
2431    /**
2432     * Returns a CharProperty matching all characters in a named property.
2433     */
2434    private CharProperty charPropertyNodeFor(String name) {
2435    CharProperty p = CharPropertyNames.charPropertyFor(name);
2436        if (p == null)
2437        throw error("Unknown character property name {" + name + "}");
2438    return p;
2439    }
2440
2441    /**
2442     * Parses a group and returns the head node of a set of nodes that process
2443     * the group. Sometimes a double return system is used where the tail is
2444     * returned in root.
2445     */
2446    private Node group0() {
2447        boolean capturingGroup = false;
2448        Node head = null;
2449        Node tail = null;
2450        int save = flags;
2451        root = null;
2452        int ch = next();
2453        if (ch == '?') {
2454            ch = skip();
2455            switch (ch) {
2456            case ':':   //  (?:xxx) pure group
2457                head = createGroup(true);
2458                tail = root;
2459                head.next = expr(tail);
2460                break;
2461            case '=':   // (?=xxx) and (?!xxx) lookahead
2462            case '!':
2463                head = createGroup(true);
2464                tail = root;
2465                head.next = expr(tail);
2466                if (ch == '=') {
2467                    head = tail = new Pos(head);
2468                } else {
2469                    head = tail = new Neg(head);
2470                }
2471                break;
2472            case '>':   // (?>xxx)  independent group
2473                head = createGroup(true);
2474                tail = root;
2475                head.next = expr(tail);
2476                head = tail = new Ques(head, INDEPENDENT);
2477                break;
2478            case '<':   // (?<xxx)  look behind
2479                ch = read();
2480        int start = cursor;
2481                head = createGroup(true);
2482                tail = root;
2483                head.next = expr(tail);
2484                tail.next = lookbehindEnd;
2485                TreeInfo info = new TreeInfo();
2486                head.study(info);
2487                if (info.maxValid == false) {
2488                    throw error("Look-behind group does not have "
2489                + "an obvious maximum length");
2490                }
2491        boolean hasSupplementary = findSupplementary(start, patternLength);
2492                if (ch == '=') {
2493                    head = tail = (hasSupplementary ?
2494                   new BehindS(head, info.maxLength,
2495                           info.minLength) :
2496                   new Behind(head, info.maxLength,
2497                          info.minLength));
2498                } else if (ch == '!') {
2499                    head = tail = (hasSupplementary ?
2500                   new NotBehindS(head, info.maxLength,
2501                          info.minLength) :
2502                   new NotBehind(head, info.maxLength,
2503                         info.minLength));
2504                } else {
2505                    throw error("Unknown look-behind group");
2506                }
2507                break;
2508            case '$':
2509            case '@':
2510        throw error("Unknown group type");
2511            default:    // (?xxx:) inlined match flags
2512                unread();
2513                addFlag();
2514                ch = read();
2515                if (ch == ')') {
2516                    return null;    // Inline modifier only
2517                }
2518                if (ch != ':') {
2519                    throw error("Unknown inline modifier");
2520                }
2521                head = createGroup(true);
2522                tail = root;
2523                head.next = expr(tail);
2524                break;
2525            }
2526        } else { // (xxx) a regular group
2527            capturingGroup = true;
2528            head = createGroup(false);
2529            tail = root;
2530            head.next = expr(tail);
2531        }
2532
2533        accept(')', "Unclosed group");
2534        flags = save;
2535
2536        // Check for quantifiers
2537        Node node = closure(head);
2538        if (node == head) { // No closure
2539            root = tail;
2540            return node;    // Dual return
2541        }
2542        if (head == tail) { // Zero length assertion
2543            root = node;
2544            return node;    // Dual return
2545        }
2546
2547        if (node instanceof Ques) {
2548            Ques ques = (Ques) node;
2549            if (ques.type == POSSESSIVE) {
2550                root = node;
2551                return node;
2552            }
2553            tail.next = new BranchConn();
2554            tail = tail.next;
2555            if (ques.type == GREEDY) {
2556                head = new Branch(head, null, tail);
2557            } else { // Reluctant quantifier
2558                head = new Branch(null, head, tail);
2559            }
2560            root = tail;
2561            return head;
2562        } else if (node instanceof Curly) {
2563            Curly curly = (Curly) node;
2564            if (curly.type == POSSESSIVE) {
2565                root = node;
2566                return node;
2567            }
2568            // Discover if the group is deterministic
2569            TreeInfo info = new TreeInfo();
2570            if (head.study(info)) { // Deterministic
2571                GroupTail temp = (GroupTail) tail;
2572                head = root = new GroupCurly(head.next, curly.cmin,
2573                                   curly.cmax, curly.type,
2574                                   ((GroupTail)tail).localIndex,
2575                                   ((GroupTail)tail).groupIndex,
2576                                             capturingGroup);
2577                return head;
2578            } else { // Non-deterministic
2579                int temp = ((GroupHead) head).localIndex;
2580                Loop loop;
2581                if (curly.type == GREEDY)
2582                    loop = new Loop(this.localCount, temp);
2583                else  // Reluctant Curly
2584                    loop = new LazyLoop(this.localCount, temp);
2585                Prolog prolog = new Prolog(loop);
2586                this.localCount += 1;
2587                loop.cmin = curly.cmin;
2588                loop.cmax = curly.cmax;
2589                loop.body = head;
2590                tail.next = loop;
2591                root = loop;
2592                return prolog; // Dual return
2593            }
2594        }
2595        throw error("Internal logic error");
2596    }
2597
2598    /**
2599     * Create group head and tail nodes using double return. If the group is
2600     * created with anonymous true then it is a pure group and should not
2601     * affect group counting.
2602     */
2603    private Node createGroup(boolean anonymous) {
2604        int localIndex = localCount++;
2605        int groupIndex = 0;
2606        if (!anonymous)
2607            groupIndex = capturingGroupCount++;
2608        GroupHead head = new GroupHead(localIndex);
2609        root = new GroupTail(localIndex, groupIndex);
2610        if (!anonymous && groupIndex < 10)
2611            groupNodes[groupIndex] = head;
2612        return head;
2613    }
2614
2615    /**
2616     * Parses inlined match flags and set them appropriately.
2617     */
2618    private void addFlag() {
2619        int ch = peek();
2620        for (;;) {
2621            switch (ch) {
2622            case 'i':
2623                flags |= CASE_INSENSITIVE;
2624                break;
2625            case 'm':
2626                flags |= MULTILINE;
2627                break;
2628            case 's':
2629                flags |= DOTALL;
2630                break;
2631            case 'd':
2632                flags |= UNIX_LINES;
2633                break;
2634            case 'u':
2635                flags |= UNICODE_CASE;
2636                break;
2637            case 'c':
2638                flags |= CANON_EQ;
2639                break;
2640            case 'x':
2641                flags |= COMMENTS;
2642                break;
2643            case '-': // subFlag then fall through
2644                ch = next();
2645                subFlag();
2646            default:
2647                return;
2648            }
2649            ch = next();
2650        }
2651    }
2652
2653    /**
2654     * Parses the second part of inlined match flags and turns off
2655     * flags appropriately.
2656     */
2657    private void subFlag() {
2658        int ch = peek();
2659        for (;;) {
2660            switch (ch) {
2661            case 'i':
2662                flags &= ~CASE_INSENSITIVE;
2663                break;
2664            case 'm':
2665                flags &= ~MULTILINE;
2666                break;
2667            case 's':
2668                flags &= ~DOTALL;
2669                break;
2670            case 'd':
2671                flags &= ~UNIX_LINES;
2672                break;
2673            case 'u':
2674                flags &= ~UNICODE_CASE;
2675                break;
2676            case 'c':
2677                flags &= ~CANON_EQ;
2678                break;
2679            case 'x':
2680                flags &= ~COMMENTS;
2681                break;
2682            default:
2683                return;
2684            }
2685            ch = next();
2686        }
2687    }
2688
2689    static final int MAX_REPS   = 0x7FFFFFFF;
2690
2691    static final int GREEDY     = 0;
2692
2693    static final int LAZY       = 1;
2694
2695    static final int POSSESSIVE = 2;
2696
2697    static final int INDEPENDENT = 3;
2698
2699    /**
2700     * Processes repetition. If the next character peeked is a quantifier
2701     * then new nodes must be appended to handle the repetition.
2702     * Prev could be a single or a group, so it could be a chain of nodes.
2703     */
2704    private Node closure(Node prev) {
2705        Node atom;
2706        int ch = peek();
2707        switch (ch) {
2708        case '?':
2709            ch = next();
2710            if (ch == '?') {
2711                next();
2712                return new Ques(prev, LAZY);
2713            } else if (ch == '+') {
2714                next();
2715                return new Ques(prev, POSSESSIVE);
2716            }
2717            return new Ques(prev, GREEDY);
2718        case '*':
2719            ch = next();
2720            if (ch == '?') {
2721                next();
2722                return new Curly(prev, 0, MAX_REPS, LAZY);
2723            } else if (ch == '+') {
2724                next();
2725                return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
2726            }
2727            return new Curly(prev, 0, MAX_REPS, GREEDY);
2728        case '+':
2729            ch = next();
2730            if (ch == '?') {
2731                next();
2732                return new Curly(prev, 1, MAX_REPS, LAZY);
2733            } else if (ch == '+') {
2734                next();
2735                return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
2736            }
2737            return new Curly(prev, 1, MAX_REPS, GREEDY);
2738        case '{':
2739            ch = temp[cursor+1];
2740            if (ASCII.isDigit(ch)) {
2741                skip();
2742                int cmin = 0;
2743                do {
2744                    cmin = cmin * 10 + (ch - '0');
2745                } while (ASCII.isDigit(ch = read()));
2746                int cmax = cmin;
2747                if (ch == ',') {
2748                    ch = read();
2749                    cmax = MAX_REPS;
2750                    if (ch != '}') {
2751                        cmax = 0;
2752                        while (ASCII.isDigit(ch)) {
2753                            cmax = cmax * 10 + (ch - '0');
2754                            ch = read();
2755                        }
2756                    }
2757                }
2758                if (ch != '}')
2759                    throw error("Unclosed counted closure");
2760                if (((cmin) | (cmax) | (cmax - cmin)) < 0)
2761                    throw error("Illegal repetition range");
2762                Curly curly;
2763                ch = peek();
2764                if (ch == '?') {
2765                    next();
2766                    curly = new Curly(prev, cmin, cmax, LAZY);
2767                } else if (ch == '+') {
2768                    next();
2769                    curly = new Curly(prev, cmin, cmax, POSSESSIVE);
2770                } else {
2771                    curly = new Curly(prev, cmin, cmax, GREEDY);
2772                }
2773                return curly;
2774            } else {
2775                throw error("Illegal repetition");
2776            }
2777        default:
2778            return prev;
2779        }
2780    }
2781
2782    /**
2783     *  Utility method for parsing control escape sequences.
2784     */
2785    private int c() {
2786        if (cursor < patternLength) {
2787            return read() ^ 64;
2788        }
2789        throw error("Illegal control escape sequence");
2790    }
2791
2792    /**
2793     *  Utility method for parsing octal escape sequences.
2794     */
2795    private int o() {
2796        int n = read();
2797        if (((n-'0')|('7'-n)) >= 0) {
2798            int m = read();
2799            if (((m-'0')|('7'-m)) >= 0) {
2800                int o = read();
2801                if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
2802                    return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
2803                }
2804                unread();
2805                return (n - '0') * 8 + (m - '0');
2806            }
2807            unread();
2808            return (n - '0');
2809        }
2810        throw error("Illegal octal escape sequence");
2811    }
2812
2813    /**
2814     *  Utility method for parsing hexadecimal escape sequences.
2815     */
2816    private int x() {
2817        int n = read();
2818        if (ASCII.isHexDigit(n)) {
2819            int m = read();
2820            if (ASCII.isHexDigit(m)) {
2821                return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
2822            }
2823        }
2824        throw error("Illegal hexadecimal escape sequence");
2825    }
2826
2827    /**
2828     *  Utility method for parsing unicode escape sequences.
2829     */
2830    private int u() {
2831        int n = 0;
2832        for (int i = 0; i < 4; i++) {
2833            int ch = read();
2834            if (!ASCII.isHexDigit(ch)) {
2835                throw error("Illegal Unicode escape sequence");
2836            }
2837            n = n * 16 + ASCII.toDigit(ch);
2838        }
2839        return n;
2840    }
2841
2842    //
2843    // Utility methods for code point support
2844    //
2845
2846    /**
2847     * Tests a surrogate value.
2848     */
2849    private static final boolean isSurrogate(int c) {
2850    return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
2851    }
2852
2853    private static final int countChars(CharSequence seq, int index,
2854                    int lengthInCodePoints) {
2855    // optimization
2856    if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
2857        assert (index >= 0 && index < seq.length());
2858        return 1;
2859    }
2860    int length = seq.length();
2861    int x = index;
2862    if (lengthInCodePoints >= 0) {
2863        assert (index >= 0 && index < length);
2864        for (int i = 0; x < length && i < lengthInCodePoints; i++) {
2865        if (Character.isHighSurrogate(seq.charAt(x++))) {
2866            if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
2867            x++;
2868            }
2869        }
2870        }
2871        return x - index;
2872    }
2873
2874    assert (index >= 0 && index <= length);
2875    if (index == 0) {
2876        return 0;
2877    }
2878    int len = -lengthInCodePoints;
2879    for (int i = 0; x > 0 && i < len; i++) {
2880        if (Character.isLowSurrogate(seq.charAt(--x))) {
2881        if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
2882            x--;
2883        }
2884        }
2885    }
2886    return index - x;
2887    }
2888
2889    private static final int countCodePoints(CharSequence seq) {
2890    int length = seq.length();
2891    int n = 0;
2892    for (int i = 0; i < length; ) {
2893        n++;
2894        if (Character.isHighSurrogate(seq.charAt(i++))) {
2895        if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
2896            i++;
2897        }
2898        }
2899    }
2900    return n;
2901    }
2902
2903    /**
2904     *  Creates a bit vector for matching Latin-1 values. A normal BitClass
2905     *  never matches values above Latin-1, and a complemented BitClass always
2906     *  matches values above Latin-1.
2907     */
2908    private static final class BitClass extends BmpCharProperty {
2909    final boolean[] bits;
2910    BitClass() { bits = new boolean[256]; }
2911        private BitClass(boolean[] bits) { this.bits = bits; }
2912        BitClass add(int c, int flags) {
2913        assert c >= 0 && c <= 255;
2914            if ((flags & CASE_INSENSITIVE) != 0) {
2915                if (ASCII.isAscii(c)) {
2916            bits[ASCII.toUpper(c)] = true;
2917            bits[ASCII.toLower(c)] = true;
2918        } else if ((flags & UNICODE_CASE) != 0) {
2919            bits[Character.toLowerCase(c)] = true;
2920            bits[Character.toUpperCase(c)] = true;
2921        }
2922        }
2923        bits[c] = true;
2924        return this;
2925        }
2926    boolean isSatisfiedBy(int ch) {
2927        return ch < 256 && bits[ch];
2928        }
2929    }
2930
2931    /**
2932     *  Returns a suitably optimized, single character matcher.
2933     */
2934    private CharProperty newSingle(final int ch) {
2935    if (has(CASE_INSENSITIVE)) {
2936        int lower, upper;
2937        if (has(UNICODE_CASE)) {
2938        upper = Character.toUpperCase(ch);
2939        lower = Character.toLowerCase(upper);
2940        if (upper != lower)
2941            return new SingleU(lower);
2942        } else if (ASCII.isAscii(ch)) {
2943        lower = ASCII.toLower(ch);
2944        upper = ASCII.toUpper(ch);
2945        if (lower != upper)
2946            return new SingleI(lower, upper);
2947        }
2948    }
2949    if (isSupplementary(ch))
2950        return new SingleS(ch);    // Match a given Unicode character
2951    return new Single(ch);         // Match a given BMP character
2952    }
2953
2954    /**
2955     *  Utility method for creating a string slice matcher.
2956     */
2957    private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
2958        int[] tmp = new int[count];
2959        if (has(CASE_INSENSITIVE)) {
2960        if (has(UNICODE_CASE)) {
2961        for (int i = 0; i < count; i++) {
2962            tmp[i] = Character.toLowerCase(
2963                     Character.toUpperCase(buf[i]));
2964        }
2965        return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
2966        }
2967        for (int i = 0; i < count; i++) {
2968        tmp[i] = ASCII.toLower(buf[i]);
2969        }
2970        return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
2971    }
2972    for (int i = 0; i < count; i++) {
2973        tmp[i] = buf[i];
2974    }
2975    return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
2976    }
2977
2978    /**
2979     * The following classes are the building components of the object
2980     * tree that represents a compiled regular expression. The object tree
2981     * is made of individual elements that handle constructs in the Pattern.
2982     * Each type of object knows how to match its equivalent construct with
2983     * the match() method.
2984     */
2985
2986    /**
2987     * Base class for all node classes. Subclasses should override the match()
2988     * method as appropriate. This class is an accepting node, so its match()
2989     * always returns true.
2990     */
2991    static class Node extends Object {
2992        Node next;
2993        Node() {
2994            next = Pattern.accept;
2995        }
2996        /**
2997         * This method implements the classic accept node.
2998         */
2999        boolean match(Matcher matcher, int i, CharSequence seq) {
3000            matcher.last = i;
3001            matcher.groups[0] = matcher.first;
3002            matcher.groups[1] = matcher.last;
3003            return true;
3004        }
3005        /**
3006         * This method is good for all zero length assertions.
3007         */
3008        boolean study(TreeInfo info) {
3009            if (next != null) {
3010                return next.study(info);
3011            } else {
3012                return info.deterministic;
3013            }
3014        }
3015    }
3016
3017    static class LastNode extends Node {
3018        /**
3019         * This method implements the classic accept node with
3020         * the addition of a check to see if the match occurred
3021         * using all of the input.
3022         */
3023        boolean match(Matcher matcher, int i, CharSequence seq) {
3024            if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
3025                return false;
3026            matcher.last = i;
3027            matcher.groups[0] = matcher.first;
3028            matcher.groups[1] = matcher.last;
3029            return true;
3030        }
3031    }
3032
3033    /**
3034     * Used for REs that can start anywhere within the input string.
3035     * This basically tries to match repeatedly at each spot in the
3036     * input string, moving forward after each try. An anchored search
3037     * or a BnM will bypass this node completely.
3038     */
3039    static class Start extends Node {
3040        int minLength;
3041        Start(Node node) {
3042            this.next = node;
3043            TreeInfo info = new TreeInfo();
3044            next.study(info);
3045            minLength = info.minLength;
3046        }
3047        boolean match(Matcher matcher, int i, CharSequence seq) {
3048            if (i > matcher.to - minLength) {
3049                matcher.hitEnd = true;
3050                return false;
3051            }
3052            boolean ret = false;
3053            int guard = matcher.to - minLength;
3054            for (; i <= guard; i++) {
3055                if (ret = next.match(matcher, i, seq))
3056                    break;
3057                if (i == guard)
3058                    matcher.hitEnd = true;
3059            }
3060            if (ret) {
3061                matcher.first = i;
3062                matcher.groups[0] = matcher.first;
3063                matcher.groups[1] = matcher.last;
3064            }
3065            return ret;
3066        }
3067        boolean study(TreeInfo info) {
3068            next.study(info);
3069            info.maxValid = false;
3070            info.deterministic = false;
3071            return false;
3072        }
3073    }
3074
3075    /*
3076     * StartS supports supplementary characters, including unpaired surrogates.
3077     */
3078    static final class StartS extends Start {
3079        StartS(Node node) {
3080        super(node);
3081        }
3082        boolean match(Matcher matcher, int i, CharSequence seq) {
3083            if (i > matcher.to - minLength) {
3084                matcher.hitEnd = true;
3085                return false;
3086            }
3087            boolean ret = false;
3088            int guard = matcher.to - minLength;
3089            while (i <= guard) {
3090                if ((ret = next.match(matcher, i, seq)) || i == guard)
3091                    break;
3092        // Optimization to move to the next character. This is
3093        // faster than countChars(seq, i, 1).
3094        if (Character.isHighSurrogate(seq.charAt(i++))) {
3095            if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) {
3096            i++;
3097            }
3098        }
3099                if (i == guard)
3100                    matcher.hitEnd = true;
3101            }
3102            if (ret) {
3103                matcher.first = i;
3104                matcher.groups[0] = matcher.first;
3105                matcher.groups[1] = matcher.last;
3106            }
3107            return ret;
3108        }
3109    }
3110
3111    /**
3112     * Node to anchor at the beginning of input. This object implements the
3113     * match for a \A sequence, and the caret anchor will use this if not in
3114     * multiline mode.
3115     */
3116    static final class Begin extends Node {
3117        boolean match(Matcher matcher, int i, CharSequence seq) {
3118            int fromIndex = (matcher.anchoringBounds) ?
3119                matcher.from : 0;
3120            if (i == fromIndex && next.match(matcher, i, seq)) {
3121                matcher.first = i;
3122                matcher.groups[0] = i;
3123                matcher.groups[1] = matcher.last;
3124                return true;
3125            } else {
3126                return false;
3127            }
3128        }
3129    }
3130
3131    /**
3132     * Node to anchor at the end of input. This is the absolute end, so this
3133     * should not match at the last newline before the end as $ will.
3134     */
3135    static final class End extends Node {
3136        boolean match(Matcher matcher, int i, CharSequence seq) {
3137            int endIndex = (matcher.anchoringBounds) ?
3138                matcher.to : matcher.getTextLength();
3139            if (i == endIndex) {
3140                matcher.hitEnd = true;
3141                return next.match(matcher, i, seq);
3142            }
3143            return false;
3144        }
3145    }
3146
3147    /**
3148     * Node to anchor at the beginning of a line. This is essentially the
3149     * object to match for the multiline ^.
3150     */
3151    static final class Caret extends Node {
3152        boolean match(Matcher matcher, int i, CharSequence seq) {
3153            int startIndex = matcher.from;
3154            int endIndex = matcher.to;
3155            if (!matcher.anchoringBounds) {
3156                startIndex = 0;
3157                endIndex = matcher.getTextLength();
3158            }
3159            // Perl does not match ^ at end of input even after newline
3160            if (i == endIndex) {
3161                matcher.hitEnd = true;
3162                return false;
3163            }
3164            if (i > startIndex) {
3165                char ch = seq.charAt(i-1);
3166                if (ch != '\n' && ch != '\r'
3167                    && (ch|1) \u2029= 'โ€ฉ'
3168                    && ch \u0085= 'ย…' ) {
3169                    return false;
3170                }
3171                // Should treat /r/n as one newline
3172                if (ch == '\r' && seq.charAt(i) == '\n')
3173                    return false;
3174            }
3175            return next.match(matcher, i, seq);
3176        }
3177    }
3178
3179    /**
3180     * Node to anchor at the beginning of a line when in unixdot mode.
3181     */
3182    static final class UnixCaret extends Node {
3183        boolean match(Matcher matcher, int i, CharSequence seq) {
3184            int startIndex = matcher.from;
3185            int endIndex = matcher.to;
3186            if (!matcher.anchoringBounds) {
3187                startIndex = 0;
3188                endIndex = matcher.getTextLength();
3189            }
3190            // Perl does not match ^ at end of input even after newline
3191            if (i == endIndex) {
3192                matcher.hitEnd = true;
3193                return false;
3194            }
3195            if (i > startIndex) {
3196                char ch = seq.charAt(i-1);
3197                if (ch != '\n') {
3198                    return false;
3199                }
3200            }
3201            return next.match(matcher, i, seq);
3202        }
3203    }
3204
3205    /**
3206     * Node to match the location where the last match ended.
3207     * This is used for the \G construct.
3208     */
3209    static final class LastMatch extends Node {
3210        boolean match(Matcher matcher, int i, CharSequence seq) {
3211            if (i != matcher.oldLast)
3212                return false;
3213            return next.match(matcher, i, seq);
3214        }
3215    }
3216
3217    /**
3218     * Node to anchor at the end of a line or the end of input based on the
3219     * multiline mode.
3220     *
3221     * When not in multiline mode, the $ can only match at the very end
3222     * of the input, unless the input ends in a line terminator in which
3223     * it matches right before the last line terminator.
3224     *
3225     * Note that \r\n is considered an atomic line terminator.
3226     *
3227     * Like ^ the $ operator matches at a position, it does not match the
3228     * line terminators themselves.
3229     */
3230    static final class Dollar extends Node {
3231        boolean multiline;
3232        Dollar(boolean mul) {
3233            multiline = mul;
3234        }
3235        boolean match(Matcher matcher, int i, CharSequence seq) {
3236            int endIndex = (matcher.anchoringBounds) ?
3237                matcher.to : matcher.getTextLength();
3238            if (!multiline) {
3239                if (i < endIndex - 2)
3240                    return false;
3241                if (i == endIndex - 2) {
3242                    char ch = seq.charAt(i);
3243                    if (ch != '\r')
3244                        return false;
3245                    ch = seq.charAt(i + 1);
3246                    if (ch != '\n')
3247                        return false;
3248                }
3249            }
3250            // Matches before any line terminator; also matches at the
3251            // end of input
3252            // Before line terminator:
3253            // If multiline, we match here no matter what
3254            // If not multiline, fall through so that the end
3255            // is marked as hit; this must be a /r/n or a /n
3256            // at the very end so the end was hit; more input
3257            // could make this not match here
3258            if (i < endIndex) {
3259                char ch = seq.charAt(i);
3260                 if (ch == '\n') {
3261                     // No match between \r\n
3262                     if (i > 0 && seq.charAt(i-1) == '\r')
3263                         return false;
3264                     if (multiline)
3265                         return next.match(matcher, i, seq);
3266                 } else if (ch == '\r' || ch \u0085= 'ย…' ||
3267                            (ch|1) \u2029= 'โ€ฉ') {
3268                     if (multiline)
3269                         return next.match(matcher, i, seq);
3270                 } else { // No line terminator, no match
3271                     return false;
3272                 }
3273            }
3274            // Matched at current end so hit end
3275            matcher.hitEnd = true;
3276            // If a $ matches because of end of input, then more input
3277            // could cause it to fail!
3278            matcher.requireEnd = true;
3279            return next.match(matcher, i, seq);
3280        }
3281        boolean study(TreeInfo info) {
3282            next.study(info);
3283            return info.deterministic;
3284        }
3285    }
3286
3287    /**
3288     * Node to anchor at the end of a line or the end of input based on the
3289     * multiline mode when in unix lines mode.
3290     */
3291    static final class UnixDollar extends Node {
3292        boolean multiline;
3293        UnixDollar(boolean mul) {
3294            multiline = mul;
3295        }
3296        boolean match(Matcher matcher, int i, CharSequence seq) {
3297            int endIndex = (matcher.anchoringBounds) ?
3298                matcher.to : matcher.getTextLength();
3299            if (i < endIndex) {
3300                char ch = seq.charAt(i);
3301                if (ch == '\n') {
3302                    // If not multiline, then only possible to
3303                    // match at very end or one before end
3304                    if (multiline == false && i != endIndex - 1)
3305                        return false;
3306                    // If multiline return next.match without setting
3307                    // matcher.hitEnd
3308                    if (multiline)
3309                        return next.match(matcher, i, seq);
3310                } else {
3311                    return false;
3312                }
3313            }
3314            // Matching because at the end or 1 before the end;
3315            // more input could change this so set hitEnd
3316            matcher.hitEnd = true;
3317            // If a $ matches because of end of input, then more input
3318            // could cause it to fail!
3319            matcher.requireEnd = true;
3320            return next.match(matcher, i, seq);
3321        }
3322        boolean study(TreeInfo info) {
3323            next.study(info);
3324            return info.deterministic;
3325        }
3326    }
3327
3328    /**
3329     * Abstract node class to match one character satisfying some
3330     * boolean property.
3331     */
3332    private static abstract class CharProperty extends Node {
3333    abstract boolean isSatisfiedBy(int ch);
3334    CharProperty complement() {
3335        return new CharProperty() {
3336            boolean isSatisfiedBy(int ch) {
3337            return ! CharProperty.this.isSatisfiedBy(ch);}};
3338    }
3339    CharProperty maybeComplement(boolean complement) {
3340        return complement ? complement() : this;
3341    }
3342        boolean match(Matcher matcher, int i, CharSequence seq) {
3343        if (i < matcher.to) {
3344        int ch = Character.codePointAt(seq, i);
3345        return isSatisfiedBy(ch)
3346            && next.match(matcher, i+Character.charCount(ch), seq);
3347        } else {
3348                matcher.hitEnd = true;
3349        return false;
3350            }
3351        }
3352        boolean study(TreeInfo info) {
3353            info.minLength++;
3354            info.maxLength++;
3355            return next.study(info);
3356    }
3357    }
3358
3359    /**
3360     * Optimized version of CharProperty that works only for
3361     * properties never satisfied by Supplementary characters.
3362     */
3363    private static abstract class BmpCharProperty extends CharProperty {
3364    boolean match(Matcher matcher, int i, CharSequence seq) {
3365        if (i < matcher.to) {
3366        return isSatisfiedBy(seq.charAt(i))
3367            && next.match(matcher, i+1, seq);
3368        } else {
3369                matcher.hitEnd = true;
3370        return false;
3371            }
3372        }
3373    }
3374
3375    /**
3376     * Node class that matches a Supplementary Unicode character
3377     */
3378    static final class SingleS extends CharProperty {
3379        final int c;
3380        SingleS(int c) { this.c = c; }
3381    boolean isSatisfiedBy(int ch) {
3382        return ch == c;
3383    }
3384    }
3385
3386    /**
3387     * Optimization -- matches a given BMP character
3388     */
3389    static final class Single extends BmpCharProperty {
3390        final int c;
3391        Single(int c) { this.c = c; }
3392    boolean isSatisfiedBy(int ch) {
3393        return ch == c;
3394    }
3395    }
3396
3397    /**
3398     * Case insensitive matches a given BMP character
3399     */
3400    static final class SingleI extends BmpCharProperty {
3401        final int lower;
3402    final int upper;
3403        SingleI(int lower, int upper) {
3404        this.lower = lower;
3405        this.upper = upper;
3406    }
3407    boolean isSatisfiedBy(int ch) {
3408        return ch == lower || ch == upper;
3409    }
3410    }
3411
3412    /**
3413     * Unicode case insensitive matches a given Unicode character
3414     */
3415    static final class SingleU extends CharProperty {
3416        final int lower;
3417        SingleU(int lower) {
3418        this.lower = lower;
3419    }
3420    boolean isSatisfiedBy(int ch) {
3421        return lower == ch ||
3422        lower == Character.toLowerCase(Character.toUpperCase(ch));
3423    }
3424    }
3425
3426    /**
3427     * Node class that matches a Unicode category.
3428     */
3429    static final class Category extends CharProperty {
3430        final int typeMask;
3431        Category(int typeMask) { this.typeMask = typeMask; }
3432    boolean isSatisfiedBy(int ch) {
3433        return (typeMask & (1 << Character.getType(ch))) != 0;
3434    }
3435    }
3436
3437    /**
3438     * Node class that matches a POSIX type.
3439     */
3440    static final class Ctype extends BmpCharProperty {
3441        final int ctype;
3442        Ctype(int ctype) { this.ctype = ctype; }
3443    boolean isSatisfiedBy(int ch) {
3444        return ch < 128 && ASCII.isType(ch, ctype);
3445    }
3446    }
3447
3448    /**
3449     * Base class for all Slice nodes
3450     */
3451    static class SliceNode extends Node {
3452        int[] buffer;
3453        SliceNode(int[] buf) {
3454            buffer = buf;
3455        }
3456        boolean study(TreeInfo info) {
3457            info.minLength += buffer.length;
3458            info.maxLength += buffer.length;
3459            return next.study(info);
3460        }
3461    }
3462
3463    /**
3464     * Node class for a case sensitive/BMP-only sequence of literal
3465     * characters.
3466     */
3467    static final class Slice extends SliceNode {
3468        Slice(int[] buf) {
3469            super(buf);
3470        }
3471        boolean match(Matcher matcher, int i, CharSequence seq) {
3472            int[] buf = buffer;
3473            int len = buf.length;
3474            for (int j=0; j<len; j++) {
3475                if ((i+j) >= matcher.to) {
3476                    matcher.hitEnd = true;
3477                    return false;
3478                }
3479                if (buf[j] != seq.charAt(i+j))
3480                    return false;
3481            }
3482            return next.match(matcher, i+len, seq);
3483        }
3484    }
3485
3486    /**
3487     * Node class for a case_insensitive/BMP-only sequence of literal
3488     * characters.
3489     */
3490    static class SliceI extends SliceNode {
3491        SliceI(int[] buf) {
3492            super(buf);
3493        }
3494        boolean match(Matcher matcher, int i, CharSequence seq) {
3495            int[] buf = buffer;
3496            int len = buf.length;
3497            for (int j=0; j<len; j++) {
3498                if ((i+j) >= matcher.to) {
3499                    matcher.hitEnd = true;
3500                    return false;
3501                }
3502        int c = seq.charAt(i+j);
3503                if (buf[j] != c &&
3504            buf[j] != ASCII.toLower(c))
3505                    return false;
3506            }
3507            return next.match(matcher, i+len, seq);
3508        }
3509    }
3510
3511    /**
3512     * Node class for a unicode_case_insensitive/BMP-only sequence of
3513     * literal characters. Uses unicode case folding.
3514     */
3515    static final class SliceU extends SliceNode {
3516        SliceU(int[] buf) {
3517        super(buf);
3518        }
3519        boolean match(Matcher matcher, int i, CharSequence seq) {
3520            int[] buf = buffer;
3521            int len = buf.length;
3522            for (int j=0; j<len; j++) {
3523                if ((i+j) >= matcher.to) {
3524                    matcher.hitEnd = true;
3525                    return false;
3526                }
3527        int c = seq.charAt(i+j);
3528                if (buf[j] != c &&
3529            buf[j] != Character.toLowerCase(Character.toUpperCase(c)))
3530            return false;
3531            }
3532            return next.match(matcher, i+len, seq);
3533        }
3534    }
3535
3536    /**
3537     * Node class for a case sensitive sequence of literal characters
3538     * including supplementary characters.
3539     */
3540    static final class SliceS extends SliceNode {
3541        SliceS(int[] buf) {
3542            super(buf);
3543        }
3544        boolean match(Matcher matcher, int i, CharSequence seq) {
3545        int[] buf = buffer;
3546        int x = i;
3547        for (int j = 0; j < buf.length; j++) {
3548                if (x >= matcher.to) {
3549                    matcher.hitEnd = true;
3550                    return false;
3551                }
3552        int c = Character.codePointAt(seq, x);
3553        if (buf[j] != c)
3554            return false;
3555        x += Character.charCount(c);
3556        if (x > matcher.to) {
3557                    matcher.hitEnd = true;
3558            return false;
3559                }
3560        }
3561            return next.match(matcher, x, seq);
3562        }
3563    }
3564
3565    /**
3566     * Node class for a case insensitive sequence of literal characters
3567     * including supplementary characters.
3568     */
3569    static class SliceIS extends SliceNode {
3570        SliceIS(int[] buf) {
3571            super(buf);
3572        }
3573    int toLower(int c) {
3574        return ASCII.toLower(c);
3575    }
3576        boolean match(Matcher matcher, int i, CharSequence seq) {
3577        int[] buf = buffer;
3578        int x = i;
3579        for (int j = 0; j < buf.length; j++) {
3580                if (x >= matcher.to) {
3581                    matcher.hitEnd = true;
3582                    return false;
3583                }
3584        int c = Character.codePointAt(seq, x);
3585        if (buf[j] != c && buf[j] != toLower(c))
3586            return false;
3587        x += Character.charCount(c);
3588        if (x > matcher.to) {
3589                    matcher.hitEnd = true;
3590            return false;
3591                }
3592        }
3593            return next.match(matcher, x, seq);
3594        }
3595    }
3596
3597    /**
3598     * Node class for a case insensitive sequence of literal characters.
3599     * Uses unicode case folding.
3600     */
3601    static final class SliceUS extends SliceIS {
3602        SliceUS(int[] buf) {
3603        super(buf);
3604        }
3605    int toLower(int c) {
3606        return Character.toLowerCase(Character.toUpperCase(c));
3607    }
3608    }
3609
3610    private static boolean inRange(int lower, int ch, int upper) {
3611    return lower <= ch && ch <= upper;
3612    }
3613
3614    /**
3615     * Returns node for matching characters within an explicit value range.
3616     */
3617    private static CharProperty rangeFor(final int lower,
3618                     final int upper) {
3619    return new CharProperty() {
3620        boolean isSatisfiedBy(int ch) {
3621            return inRange(lower, ch, upper);}};
3622    }
3623
3624    /**
3625     * Returns node for matching characters within an explicit value
3626     * range in a case insensitive manner.
3627     */
3628    private CharProperty caseInsensitiveRangeFor(final int lower,
3629                         final int upper) {
3630    if (has(UNICODE_CASE))
3631        return new CharProperty() {
3632        boolean isSatisfiedBy(int ch) {
3633            if (inRange(lower, ch, upper))
3634            return true;
3635            int up = Character.toUpperCase(ch);
3636            return inRange(lower, up, upper) ||
3637                   inRange(lower, Character.toLowerCase(up), upper);}};
3638        return new CharProperty() { 
3639            boolean isSatisfiedBy(int ch) { 
3640                return inRange(lower, ch, upper) || 
3641                    ASCII.isAscii(ch) && 
3642                        (inRange(lower, ASCII.toUpper(ch), upper) || 
3643             inRange(lower, ASCII.toLower(ch), upper)); 
3644        }}; 
3645    }
3646
3647    /**
3648     * Implements the Unicode category ALL and the dot metacharacter when
3649     * in dotall mode.
3650     */
3651    static final class All extends CharProperty {
3652    boolean isSatisfiedBy(int ch) {
3653        return true;
3654    }
3655    }
3656
3657    /**
3658     * Node class for the dot metacharacter when dotall is not enabled.
3659     */
3660    static final class Dot extends CharProperty {
3661    boolean isSatisfiedBy(int ch) {
3662        return (ch != '\n' && ch != '\r'
3663                    && (ch|1) \u2029= 'โ€ฉ'
3664                    && ch \u0085= 'ย…');
3665        }
3666    }
3667
3668    /**
3669     * Node class for the dot metacharacter when dotall is not enabled
3670     * but UNIX_LINES is enabled.
3671     */
3672    static final class UnixDot extends CharProperty {
3673    boolean isSatisfiedBy(int ch) {
3674        return ch != '\n';
3675    }
3676    }
3677
3678    /**
3679     * The 0 or 1 quantifier. This one class implements all three types.
3680     */
3681    static final class Ques extends Node {
3682        Node atom;
3683        int type;
3684        Ques(Node node, int type) {
3685            this.atom = node;
3686            this.type = type;
3687        }
3688        boolean match(Matcher matcher, int i, CharSequence seq) {
3689            switch (type) {
3690            case GREEDY:
3691                return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
3692                    || next.match(matcher, i, seq);
3693            case LAZY:
3694                return next.match(matcher, i, seq)
3695                    || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
3696            case POSSESSIVE:
3697                if (atom.match(matcher, i, seq)) i = matcher.last;
3698                return next.match(matcher, i, seq);
3699            default:
3700                return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
3701            }
3702        }
3703        boolean study(TreeInfo info) {
3704            if (type != INDEPENDENT) {
3705                int minL = info.minLength;
3706                atom.study(info);
3707                info.minLength = minL;
3708                info.deterministic = false;
3709                return next.study(info);
3710            } else {
3711                atom.study(info);
3712                return next.study(info);
3713            }
3714        }
3715    }
3716
3717    /**
3718     * Handles the curly-brace style repetition with a specified minimum and
3719     * maximum occurrences. The * quantifier is handled as a special case.
3720     * This class handles the three types.
3721     */
3722    static final class Curly extends Node {
3723        Node atom;
3724        int type;
3725        int cmin;
3726        int cmax;
3727
3728        Curly(Node node, int cmin, int cmax, int type) {
3729            this.atom = node;
3730            this.type = type;
3731            this.cmin = cmin;
3732            this.cmax = cmax;
3733        }
3734        boolean match(Matcher matcher, int i, CharSequence seq) {
3735            int j;
3736            for (j = 0; j < cmin; j++) {
3737                if (atom.match(matcher, i, seq)) {
3738                    i = matcher.last;
3739                    continue;
3740                }
3741                return false;
3742            }
3743            if (type == GREEDY)
3744                return match0(matcher, i, j, seq);
3745            else if (type == LAZY)
3746                return match1(matcher, i, j, seq);
3747            else
3748                return match2(matcher, i, j, seq);
3749        }
3750        // Greedy match.
3751        // i is the index to start matching at
3752        // j is the number of atoms that have matched
3753        boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
3754            if (j >= cmax) {
3755                // We have matched the maximum... continue with the rest of
3756                // the regular expression
3757                return next.match(matcher, i, seq);
3758            }
3759            int backLimit = j;
3760            while (atom.match(matcher, i, seq)) {
3761                // k is the length of this match
3762                int k = matcher.last - i;
3763                if (k == 0) // Zero length match
3764                    break;
3765                // Move up index and number matched
3766                i = matcher.last;
3767                j++;
3768                // We are greedy so match as many as we can
3769                while (j < cmax) {
3770                    if (!atom.match(matcher, i, seq))
3771                        break;
3772                    if (i + k != matcher.last) {
3773                        if (match0(matcher, matcher.last, j+1, seq))
3774                            return true;
3775                        break;
3776                    }
3777                    i += k;
3778                    j++;
3779                }
3780                // Handle backing off if match fails
3781                while (j >= backLimit) {
3782                   if (next.match(matcher, i, seq))
3783                        return true;
3784                    i -= k;
3785                    j--;
3786                }
3787                return false;
3788            }
3789            return next.match(matcher, i, seq);
3790        }
3791        // Reluctant match. At this point, the minimum has been satisfied.
3792        // i is the index to start matching at
3793        // j is the number of atoms that have matched
3794        boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
3795            for (;;) {
3796                // Try finishing match without consuming any more
3797                if (next.match(matcher, i, seq))
3798                    return true;
3799                // At the maximum, no match found
3800                if (j >= cmax)
3801                    return false;
3802                // Okay, must try one more atom
3803                if (!atom.match(matcher, i, seq))
3804                    return false;
3805                // If we haven't moved forward then must break out
3806                if (i == matcher.last)
3807                    return false;
3808                // Move up index and number matched
3809                i = matcher.last;
3810                j++;
3811            }
3812        }
3813        boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
3814            for (; j < cmax; j++) {
3815                if (!atom.match(matcher, i, seq))
3816                    break;
3817                if (i == matcher.last)
3818                    break;
3819                i = matcher.last;
3820            }
3821            return next.match(matcher, i, seq);
3822        }
3823        boolean study(TreeInfo info) {
3824            // Save original info
3825            int minL = info.minLength;
3826            int maxL = info.maxLength;
3827            boolean maxV = info.maxValid;
3828            boolean detm = info.deterministic;
3829            info.reset();
3830
3831            atom.study(info);
3832
3833            int temp = info.minLength * cmin + minL;
3834            if (temp < minL) {
3835                temp = 0xFFFFFFF; // arbitrary large number
3836            }
3837            info.minLength = temp;
3838
3839            if (maxV & info.maxValid) {
3840                temp = info.maxLength * cmax + maxL;
3841                info.maxLength = temp;
3842                if (temp < maxL) {
3843                    info.maxValid = false;
3844                }
3845            } else {
3846                info.maxValid = false;
3847            }
3848
3849            if (info.deterministic && cmin == cmax)
3850                info.deterministic = detm;
3851            else
3852                info.deterministic = false;
3853
3854            return next.study(info);
3855        }
3856    }
3857
3858    /**
3859     * Handles the curly-brace style repetition with a specified minimum and
3860     * maximum occurrences in deterministic cases. This is an iterative
3861     * optimization over the Prolog and Loop system which would handle this
3862     * in a recursive way. The * quantifier is handled as a special case.
3863     * If capture is true then this class saves group settings and ensures
3864     * that groups are unset when backing off of a group match.
3865     */
3866    static final class GroupCurly extends Node {
3867        Node atom;
3868        int type;
3869        int cmin;
3870        int cmax;
3871        int localIndex;
3872        int groupIndex;
3873        boolean capture;
3874
3875        GroupCurly(Node node, int cmin, int cmax, int type, int local,
3876                   int group, boolean capture) {
3877            this.atom = node;
3878            this.type = type;
3879            this.cmin = cmin;
3880            this.cmax = cmax;
3881            this.localIndex = local;
3882            this.groupIndex = group;
3883            this.capture = capture;
3884        }
3885        boolean match(Matcher matcher, int i, CharSequence seq) {
3886            int[] groups = matcher.groups;
3887            int[] locals = matcher.locals;
3888            int save0 = locals[localIndex];
3889            int save1 = 0;
3890            int save2 = 0;
3891
3892            if (capture) {
3893                save1 = groups[groupIndex];
3894                save2 = groups[groupIndex+1];
3895            }
3896
3897            // Notify GroupTail there is no need to setup group info
3898            // because it will be set here
3899            locals[localIndex] = -1;
3900
3901            boolean ret = true;
3902            for (int j = 0; j < cmin; j++) {
3903                if (atom.match(matcher, i, seq)) {
3904                    if (capture) {
3905                        groups[groupIndex] = i;
3906                        groups[groupIndex+1] = matcher.last;
3907                    }
3908                    i = matcher.last;
3909                } else {
3910                    ret = false;
3911                    break;
3912                }
3913            }
3914            if (ret) {
3915                if (type == GREEDY) {
3916                    ret = match0(matcher, i, cmin, seq);
3917                } else if (type == LAZY) {
3918                    ret = match1(matcher, i, cmin, seq);
3919                } else {
3920                    ret = match2(matcher, i, cmin, seq);
3921                }
3922        }
3923            if (!ret) {
3924                locals[localIndex] = save0;
3925                if (capture) {
3926                    groups[groupIndex] = save1;
3927                    groups[groupIndex+1] = save2;
3928                }
3929        }
3930            return ret;
3931        }
3932        // Aggressive group match
3933        boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
3934            int[] groups = matcher.groups;
3935            int save0 = 0;
3936            int save1 = 0;
3937            if (capture) {
3938                save0 = groups[groupIndex];
3939                save1 = groups[groupIndex+1];
3940            }
3941            for (;;) {
3942                if (j >= cmax)
3943                    break;
3944                if (!atom.match(matcher, i, seq))
3945                    break;
3946                int k = matcher.last - i;
3947                if (k <= 0) {
3948                    if (capture) {
3949                        groups[groupIndex] = i;
3950                        groups[groupIndex+1] = i + k;
3951                    }
3952                    i = i + k;
3953                    break;
3954                }
3955                for (;;) {
3956                    if (capture) {
3957                        groups[groupIndex] = i;
3958                        groups[groupIndex+1] = i + k;
3959                    }
3960                    i = i + k;
3961                    if (++j >= cmax)
3962                        break;
3963                    if (!atom.match(matcher, i, seq))
3964                        break;
3965                    if (i + k != matcher.last) {
3966                        if (match0(matcher, i, j, seq))
3967                            return true;
3968                        break;
3969                    }
3970                }
3971                while (j > cmin) {
3972                    if (next.match(matcher, i, seq)) {
3973                        if (capture) {
3974                            groups[groupIndex+1] = i;
3975                            groups[groupIndex] = i - k;
3976                        }
3977                        i = i - k;
3978                        return true;
3979                    }
3980                    // backing off
3981                    if (capture) {
3982                        groups[groupIndex+1] = i;
3983                        groups[groupIndex] = i - k;
3984                    }
3985                    i = i - k;
3986                    j--;
3987                }
3988                break;
3989            }
3990            if (capture) {
3991                groups[groupIndex] = save0;
3992                groups[groupIndex+1] = save1;
3993            }
3994            return next.match(matcher, i, seq);
3995        }
3996        // Reluctant matching
3997        boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
3998            for (;;) {
3999                if (next.match(matcher, i, seq))
4000                    return true;
4001                if (j >= cmax)
4002                    return false;
4003                if (!atom.match(matcher, i, seq))
4004                    return false;
4005                if (i == matcher.last)
4006                    return false;
4007                if (capture) {
4008                    matcher.groups[groupIndex] = i;
4009                    matcher.groups[groupIndex+1] = matcher.last;
4010                }
4011                i = matcher.last;
4012                j++;
4013            }
4014        }
4015        // Possessive matching
4016        boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
4017            for (; j < cmax; j++) {
4018                if (!atom.match(matcher, i, seq)) {
4019                    break;
4020                }
4021                if (capture) {
4022                    matcher.groups[groupIndex] = i;
4023                    matcher.groups[groupIndex+1] = matcher.last;
4024                }
4025                if (i == matcher.last) {
4026                    break;
4027                }
4028                i = matcher.last;
4029            }
4030            return next.match(matcher, i, seq);
4031        }
4032        boolean study(TreeInfo info) {
4033            // Save original info
4034            int minL = info.minLength;
4035            int maxL = info.maxLength;
4036            boolean maxV = info.maxValid;
4037            boolean detm = info.deterministic;
4038            info.reset();
4039
4040            atom.study(info);
4041
4042            int temp = info.minLength * cmin + minL;
4043            if (temp < minL) {
4044                temp = 0xFFFFFFF; // Arbitrary large number
4045            }
4046            info.minLength = temp;
4047
4048            if (maxV & info.maxValid) {
4049                temp = info.maxLength * cmax + maxL;
4050                info.maxLength = temp;
4051                if (temp < maxL) {
4052                    info.maxValid = false;
4053                }
4054            } else {
4055                info.maxValid = false;
4056            }
4057
4058            if (info.deterministic && cmin == cmax) {
4059                info.deterministic = detm;
4060            } else {
4061                info.deterministic = false;
4062            }
4063
4064            return next.study(info);
4065        }
4066    }
4067
4068    /**
4069     * A Guard node at the end of each atom node in a Branch. It
4070     * serves the purpose of chaining the "match" operation to
4071     * "next" but not the "study", so we can collect the TreeInfo
4072     * of each atom node without including the TreeInfo of the
4073     * "next".
4074     */
4075    static final class BranchConn extends Node {
4076        BranchConn() {};
4077        boolean match(Matcher matcher, int i, CharSequence seq) {
4078            return next.match(matcher, i, seq);
4079        }
4080        boolean study(TreeInfo info) {
4081        return info.deterministic;
4082        }
4083    }
4084
4085    /**
4086     * Handles the branching of alternations. Note this is also used for
4087     * the ? quantifier to branch between the case where it matches once
4088     * and where it does not occur.
4089     */
4090    static final class Branch extends Node {
4091        Node[] atoms = new Node[2];
4092        int size = 2;
4093        Node conn;
4094        Branch(Node first, Node second, Node branchConn) {
4095            conn = branchConn;
4096            atoms[0] = first;            
4097            atoms[1] = second;
4098        }
4099
4100        void add(Node node) {
4101            if (size >= atoms.length) {
4102                Node[] tmp = new Node[atoms.length*2];
4103                System.arraycopy(atoms, 0, tmp, 0, atoms.length);
4104                atoms = tmp;
4105            }
4106            atoms[size++] = node;
4107        }
4108
4109        boolean match(Matcher matcher, int i, CharSequence seq) {
4110            for (int n = 0; n < size; n++) {
4111                if (atoms[n] == null) {
4112                    if (conn.next.match(matcher, i, seq))
4113                        return true;
4114        } else if (atoms[n].match(matcher, i, seq)) {
4115                    return true;
4116                }
4117            }
4118            return false;
4119        }
4120
4121        boolean study(TreeInfo info) {
4122            int minL = info.minLength;
4123            int maxL = info.maxLength;
4124            boolean maxV = info.maxValid;
4125
4126            int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
4127            int maxL2 = -1;
4128            for (int n = 0; n < size; n++) {
4129                info.reset();
4130                if (atoms[n] != null)
4131                    atoms[n].study(info);
4132                minL2 = Math.min(minL2, info.minLength);
4133                maxL2 = Math.max(maxL2, info.maxLength);
4134                maxV = (maxV & info.maxValid);
4135            }
4136
4137            minL += minL2;
4138            maxL += maxL2;
4139
4140            info.reset();
4141            conn.next.study(info);
4142
4143            info.minLength += minL;
4144            info.maxLength += maxL;
4145            info.maxValid &= maxV;
4146            info.deterministic = false;
4147            return false;
4148        }
4149    }
4150
4151    /**
4152     * The GroupHead saves the location where the group begins in the locals
4153     * and restores them when the match is done.
4154     *
4155     * The matchRef is used when a reference to this group is accessed later
4156     * in the expression. The locals will have a negative value in them to
4157     * indicate that we do not want to unset the group if the reference
4158     * doesn't match.
4159     */
4160    static final class GroupHead extends Node {
4161        int localIndex;
4162        GroupHead(int localCount) {
4163            localIndex = localCount;
4164        }
4165        boolean match(Matcher matcher, int i, CharSequence seq) {
4166            int save = matcher.locals[localIndex];
4167            matcher.locals[localIndex] = i;
4168            boolean ret = next.match(matcher, i, seq);
4169            matcher.locals[localIndex] = save;
4170            return ret;
4171        }
4172        boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4173            int save = matcher.locals[localIndex];
4174            matcher.locals[localIndex] = ~i; // HACK
4175            boolean ret = next.match(matcher, i, seq);
4176            matcher.locals[localIndex] = save;
4177            return ret;
4178        }
4179    }
4180
4181    /**
4182     * Recursive reference to a group in the regular expression. It calls
4183     * matchRef because if the reference fails to match we would not unset
4184     * the group.
4185     */
4186    static final class GroupRef extends Node {
4187        GroupHead head;
4188        GroupRef(GroupHead head) {
4189            this.head = head;
4190        }
4191        boolean match(Matcher matcher, int i, CharSequence seq) {
4192            return head.matchRef(matcher, i, seq)
4193                && next.match(matcher, matcher.last, seq);
4194        }
4195        boolean study(TreeInfo info) {
4196            info.maxValid = false;
4197            info.deterministic = false;
4198            return next.study(info);
4199        }
4200    }
4201
4202    /**
4203     * The GroupTail handles the setting of group beginning and ending
4204     * locations when groups are successfully matched. It must also be able to
4205     * unset groups that have to be backed off of.
4206     *
4207     * The GroupTail node is also used when a previous group is referenced,
4208     * and in that case no group information needs to be set.
4209     */
4210    static final class GroupTail extends Node {
4211        int localIndex;
4212        int groupIndex;
4213        GroupTail(int localCount, int groupCount) {
4214            localIndex = localCount;
4215            groupIndex = groupCount + groupCount;
4216        }
4217        boolean match(Matcher matcher, int i, CharSequence seq) {
4218            int tmp = matcher.locals[localIndex];
4219            if (tmp >= 0) { // This is the normal group case.
4220                // Save the group so we can unset it if it
4221                // backs off of a match.
4222                int groupStart = matcher.groups[groupIndex];
4223                int groupEnd = matcher.groups[groupIndex+1];
4224
4225                matcher.groups[groupIndex] = tmp;
4226                matcher.groups[groupIndex+1] = i;
4227                if (next.match(matcher, i, seq)) {
4228                    return true;
4229                }
4230                matcher.groups[groupIndex] = groupStart;
4231                matcher.groups[groupIndex+1] = groupEnd;
4232                return false;
4233            } else {
4234                // This is a group reference case. We don't need to save any
4235                // group info because it isn't really a group.
4236                matcher.last = i;
4237                return true;
4238            }
4239        }
4240    }
4241
4242    /**
4243     * This sets up a loop to handle a recursive quantifier structure.
4244     */
4245    static final class Prolog extends Node {
4246        Loop loop;
4247        Prolog(Loop loop) {
4248            this.loop = loop;
4249        }
4250        boolean match(Matcher matcher, int i, CharSequence seq) {
4251            return loop.matchInit(matcher, i, seq);
4252        }
4253        boolean study(TreeInfo info) {
4254            return loop.study(info);
4255        }
4256    }
4257
4258    /**
4259     * Handles the repetition count for a greedy Curly. The matchInit
4260     * is called from the Prolog to save the index of where the group
4261     * beginning is stored. A zero length group check occurs in the
4262     * normal match but is skipped in the matchInit.
4263     */
4264    static class Loop extends Node {
4265        Node body;
4266        int countIndex; // local count index in matcher locals
4267        int beginIndex; // group beginning index
4268        int cmin, cmax;
4269        Loop(int countIndex, int beginIndex) {
4270            this.countIndex = countIndex;
4271            this.beginIndex = beginIndex;
4272        }
4273        boolean match(Matcher matcher, int i, CharSequence seq) {
4274            // Avoid infinite loop in zero-length case.
4275            if (i > matcher.locals[beginIndex]) {
4276                int count = matcher.locals[countIndex];
4277
4278                // This block is for before we reach the minimum
4279                // iterations required for the loop to match
4280                if (count < cmin) {
4281                    matcher.locals[countIndex] = count + 1;
4282                    boolean b = body.match(matcher, i, seq);
4283                    // If match failed we must backtrack, so
4284                    // the loop count should NOT be incremented
4285                    if (!b)
4286                        matcher.locals[countIndex] = count;
4287                    // Return success or failure since we are under
4288                    // minimum
4289                    return b;
4290                }
4291                // This block is for after we have the minimum
4292                // iterations required for the loop to match
4293                if (count < cmax) {
4294                    matcher.locals[countIndex] = count + 1;
4295                    boolean b = body.match(matcher, i, seq);
4296                    // If match failed we must backtrack, so
4297                    // the loop count should NOT be incremented
4298                    if (!b)
4299                        matcher.locals[countIndex] = count;
4300                    else
4301                        return true;
4302                }
4303            }
4304            return next.match(matcher, i, seq);
4305        }
4306        boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4307            int save = matcher.locals[countIndex];
4308            boolean ret = false;
4309            if (0 < cmin) {
4310                matcher.locals[countIndex] = 1;
4311                ret = body.match(matcher, i, seq);
4312            } else if (0 < cmax) {
4313                matcher.locals[countIndex] = 1;
4314                ret = body.match(matcher, i, seq);
4315                if (ret == false)
4316                    ret = next.match(matcher, i, seq);
4317            } else {
4318                ret = next.match(matcher, i, seq);
4319            }
4320            matcher.locals[countIndex] = save;
4321            return ret;
4322        }
4323        boolean study(TreeInfo info) {
4324            info.maxValid = false;
4325            info.deterministic = false;
4326            return false;
4327        }
4328    }
4329
4330    /**
4331     * Handles the repetition count for a reluctant Curly. The matchInit
4332     * is called from the Prolog to save the index of where the group
4333     * beginning is stored. A zero length group check occurs in the
4334     * normal match but is skipped in the matchInit.
4335     */
4336    static final class LazyLoop extends Loop {
4337        LazyLoop(int countIndex, int beginIndex) {
4338            super(countIndex, beginIndex);
4339        }
4340        boolean match(Matcher matcher, int i, CharSequence seq) {
4341            // Check for zero length group
4342            if (i > matcher.locals[beginIndex]) {
4343                int count = matcher.locals[countIndex];
4344                if (count < cmin) {
4345                    matcher.locals[countIndex] = count + 1;
4346                    boolean result = body.match(matcher, i, seq);
4347                    // If match failed we must backtrack, so
4348                    // the loop count should NOT be incremented
4349                    if (!result)
4350                        matcher.locals[countIndex] = count;
4351                    return result;
4352                }
4353                if (next.match(matcher, i, seq))
4354                    return true;
4355                if (count < cmax) {
4356                    matcher.locals[countIndex] = count + 1;
4357                    boolean result = body.match(matcher, i, seq);
4358                    // If match failed we must backtrack, so
4359                    // the loop count should NOT be incremented
4360                    if (!result)
4361                        matcher.locals[countIndex] = count;
4362                    return result;
4363                }
4364                return false;
4365            }
4366            return next.match(matcher, i, seq);
4367        }
4368        boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4369            int save = matcher.locals[countIndex];
4370            boolean ret = false;
4371            if (0 < cmin) {
4372                matcher.locals[countIndex] = 1;
4373                ret = body.match(matcher, i, seq);
4374            } else if (next.match(matcher, i, seq)) {
4375                ret = true;
4376            } else if (0 < cmax) {
4377                matcher.locals[countIndex] = 1;
4378                ret = body.match(matcher, i, seq);
4379            }
4380            matcher.locals[countIndex] = save;
4381            return ret;
4382        }
4383        boolean study(TreeInfo info) {
4384            info.maxValid = false;
4385            info.deterministic = false;
4386            return false;
4387        }
4388    }
4389
4390    /**
4391     * Refers to a group in the regular expression. Attempts to match
4392     * whatever the group referred to last matched.
4393     */
4394    static class BackRef extends Node {
4395        int groupIndex;
4396        BackRef(int groupCount) {
4397            super();
4398            groupIndex = groupCount + groupCount;
4399        }
4400        boolean match(Matcher matcher, int i, CharSequence seq) {
4401            int j = matcher.groups[groupIndex];
4402            int k = matcher.groups[groupIndex+1];
4403
4404            int groupSize = k - j;
4405
4406            // If the referenced group didn't match, neither can this
4407            if (j < 0)
4408                return false;
4409
4410            // If there isn't enough input left no match
4411            if (i + groupSize > matcher.to) {
4412                matcher.hitEnd = true;
4413                return false;
4414            }
4415
4416            // Check each new char to make sure it matches what the group
4417            // referenced matched last time around
4418            for (int index=0; index<groupSize; index++)
4419                if (seq.charAt(i+index) != seq.charAt(j+index))
4420                    return false;
4421
4422            return next.match(matcher, i+groupSize, seq);
4423        }
4424        boolean study(TreeInfo info) {
4425            info.maxValid = false;
4426            return next.study(info);
4427        }
4428    }
4429
4430    static class CIBackRef extends Node {
4431        int groupIndex;
4432    boolean doUnicodeCase;
4433        CIBackRef(int groupCount, boolean doUnicodeCase) {
4434            super();
4435            groupIndex = groupCount + groupCount;
4436        this.doUnicodeCase = doUnicodeCase;
4437        }
4438        boolean match(Matcher matcher, int i, CharSequence seq) {
4439            int j = matcher.groups[groupIndex];
4440            int k = matcher.groups[groupIndex+1];
4441
4442            int groupSize = k - j;
4443
4444            // If the referenced group didn't match, neither can this
4445            if (j < 0)
4446                return false;
4447
4448            // If there isn't enough input left no match
4449            if (i + groupSize > matcher.to) {
4450                matcher.hitEnd = true;
4451                return false;
4452            }
4453
4454            // Check each new char to make sure it matches what the group
4455            // referenced matched last time around
4456        int x = i;
4457            for (int index=0; index<groupSize; index++) {
4458                int c1 = Character.codePointAt(seq, x);
4459                int c2 = Character.codePointAt(seq, j);
4460                if (c1 != c2) {
4461            if (doUnicodeCase) {
4462            int cc1 = Character.toUpperCase(c1);
4463            int cc2 = Character.toUpperCase(c2);
4464            if (cc1 != cc2 &&
4465                Character.toLowerCase(cc1) != 
4466                Character.toLowerCase(cc2))
4467                return false;
4468            } else {
4469            if (ASCII.toLower(c1) != ASCII.toLower(c2))
4470                return false;
4471            }
4472        }
4473        x += Character.charCount(c1);
4474        j += Character.charCount(c2);
4475            }
4476
4477            return next.match(matcher, i+groupSize, seq);
4478        }
4479        boolean study(TreeInfo info) {
4480            info.maxValid = false;
4481            return next.study(info);
4482        }
4483    }
4484
4485    /**
4486     * Searches until the next instance of its atom. This is useful for
4487     * finding the atom efficiently without passing an instance of it
4488     * (greedy problem) and without a lot of wasted search time (reluctant
4489     * problem).
4490     */
4491    static final class First extends Node {
4492        Node atom;
4493        First(Node node) {
4494            this.atom = BnM.optimize(node);
4495        }
4496        boolean match(Matcher matcher, int i, CharSequence seq) {
4497            if (atom instanceof BnM) {
4498                return atom.match(matcher, i, seq)
4499                    && next.match(matcher, matcher.last, seq);
4500            }
4501            for (;;) {
4502                if (i > matcher.to) {
4503                    matcher.hitEnd = true;
4504                    return false;
4505                }
4506                if (atom.match(matcher, i, seq)) {
4507                    return next.match(matcher, matcher.last, seq);
4508                }
4509                i += countChars(seq, i, 1);
4510                matcher.first++;
4511            }
4512        }
4513        boolean study(TreeInfo info) {
4514            atom.study(info);
4515            info.maxValid = false;
4516            info.deterministic = false;
4517            return next.study(info);
4518        }
4519    }
4520
4521    static final class Conditional extends Node {
4522        Node cond, yes, not;
4523        Conditional(Node cond, Node yes, Node not) {
4524            this.cond = cond;
4525            this.yes = yes;
4526            this.not = not;
4527        }
4528        boolean match(Matcher matcher, int i, CharSequence seq) {
4529            if (cond.match(matcher, i, seq)) {
4530                return yes.match(matcher, i, seq);
4531            } else {
4532                return not.match(matcher, i, seq);
4533            }
4534        }
4535        boolean study(TreeInfo info) {
4536            int minL = info.minLength;
4537            int maxL = info.maxLength;
4538            boolean maxV = info.maxValid;
4539            info.reset();
4540            yes.study(info);
4541
4542            int minL2 = info.minLength;
4543            int maxL2 = info.maxLength;
4544            boolean maxV2 = info.maxValid;
4545            info.reset();
4546            not.study(info);
4547
4548            info.minLength = minL + Math.min(minL2, info.minLength);
4549            info.maxLength = maxL + Math.max(maxL2, info.maxLength);
4550            info.maxValid = (maxV & maxV2 & info.maxValid);
4551            info.deterministic = false;
4552            return next.study(info);
4553        }
4554    }
4555
4556    /**
4557     * Zero width positive lookahead.
4558     */
4559    static final class Pos extends Node {
4560        Node cond;
4561        Pos(Node cond) {
4562            this.cond = cond;
4563        }
4564        boolean match(Matcher matcher, int i, CharSequence seq) {
4565            int savedTo = matcher.to;
4566            boolean conditionMatched = false;
4567
4568            // Relax transparent region boundaries for lookahead
4569            if (matcher.transparentBounds)
4570                matcher.to = matcher.getTextLength();
4571            try {
4572                conditionMatched = cond.match(matcher, i, seq);
4573            } finally {
4574                // Reinstate region boundaries
4575                matcher.to = savedTo;
4576            }
4577            return conditionMatched && next.match(matcher, i, seq);
4578        }
4579    }
4580
4581    /**
4582     * Zero width negative lookahead.
4583     */
4584    static final class Neg extends Node {
4585        Node cond;
4586        Neg(Node cond) {
4587            this.cond = cond;
4588        }
4589        boolean match(Matcher matcher, int i, CharSequence seq) {
4590            int savedTo = matcher.to;
4591            boolean conditionMatched = false;
4592
4593            // Relax transparent region boundaries for lookahead
4594            if (matcher.transparentBounds)
4595                matcher.to = matcher.getTextLength();
4596            try {
4597                if (i < matcher.to) {
4598                    conditionMatched = !cond.match(matcher, i, seq);
4599                } else {
4600                    // If a negative lookahead succeeds then more input
4601                    // could cause it to fail!
4602                    matcher.requireEnd = true;
4603                    conditionMatched = !cond.match(matcher, i, seq);
4604                }
4605            } finally {
4606                // Reinstate region boundaries
4607                matcher.to = savedTo;
4608            }
4609            return conditionMatched && next.match(matcher, i, seq);
4610        }
4611    }
4612
4613    /**
4614     * For use with lookbehinds; matches the position where the lookbehind
4615     * was encountered.
4616     */
4617    static Node lookbehindEnd = new Node() {
4618        boolean match(Matcher matcher, int i, CharSequence seq) {
4619            return i == matcher.lookbehindTo;
4620        }
4621    };
4622
4623    /**
4624     * Zero width positive lookbehind.
4625     */
4626    static class Behind extends Node {
4627        Node cond;
4628        int rmax, rmin;
4629        Behind(Node cond, int rmax, int rmin) {
4630            this.cond = cond;
4631            this.rmax = rmax;
4632            this.rmin = rmin;
4633        }
4634
4635        boolean match(Matcher matcher, int i, CharSequence seq) {
4636            int savedFrom = matcher.from;
4637            boolean conditionMatched = false;
4638            int startIndex = (!matcher.transparentBounds) ?
4639                             matcher.from : 0;
4640            int from = Math.max(i - rmax, startIndex);
4641            // Set end boundary
4642            int savedLBT = matcher.lookbehindTo;
4643            matcher.lookbehindTo = i;
4644            // Relax transparent region boundaries for lookbehind
4645            if (matcher.transparentBounds)
4646                matcher.from = 0;
4647            for (int j = i - rmin; !conditionMatched && j >= from; j--) {
4648                conditionMatched = cond.match(matcher, j, seq); 
4649            }
4650            matcher.from = savedFrom;
4651            matcher.lookbehindTo = savedLBT;
4652            return conditionMatched && next.match(matcher, i, seq);
4653        }
4654    }
4655
4656    /**
4657     * Zero width positive lookbehind, including supplementary
4658     * characters or unpaired surrogates.
4659     */
4660    static final class BehindS extends Behind {
4661        BehindS(Node cond, int rmax, int rmin) {
4662            super(cond, rmax, rmin);
4663        }
4664        boolean match(Matcher matcher, int i, CharSequence seq) {
4665        int rmaxChars = countChars(seq, i, -rmax);
4666        int rminChars = countChars(seq, i, -rmin);
4667            int savedFrom = matcher.from;
4668            int startIndex = (!matcher.transparentBounds) ?
4669                             matcher.from : 0;
4670            boolean conditionMatched = false;
4671            int from = Math.max(i - rmaxChars, startIndex);
4672            // Set end boundary
4673            int savedLBT = matcher.lookbehindTo;
4674            matcher.lookbehindTo = i;
4675            // Relax transparent region boundaries for lookbehind
4676            if (matcher.transparentBounds)
4677                matcher.from = 0;
4678
4679            for (int j = i - rminChars;
4680                 !conditionMatched && j >= from;
4681                 j -= j>from ? countChars(seq, j, -1) : 1) {
4682                conditionMatched = cond.match(matcher, j, seq); 
4683            }
4684        matcher.from = savedFrom;
4685            matcher.lookbehindTo = savedLBT;
4686            return conditionMatched && next.match(matcher, i, seq);
4687        }
4688    }
4689
4690    /**
4691     * Zero width negative lookbehind.
4692     */
4693    static class NotBehind extends Node {
4694        Node cond;
4695        int rmax, rmin;
4696        NotBehind(Node cond, int rmax, int rmin) {
4697            this.cond = cond;
4698            this.rmax = rmax;
4699            this.rmin = rmin;
4700        }
4701
4702        boolean match(Matcher matcher, int i, CharSequence seq) {
4703            int savedLBT = matcher.lookbehindTo;
4704            int savedFrom = matcher.from;
4705            boolean conditionMatched = false;
4706            int startIndex = (!matcher.transparentBounds) ?
4707                             matcher.from : 0;
4708            int from = Math.max(i - rmax, startIndex);
4709            matcher.lookbehindTo = i;
4710            // Relax transparent region boundaries for lookbehind
4711            if (matcher.transparentBounds)
4712                matcher.from = 0;
4713            for (int j = i - rmin; !conditionMatched && j >= from; j--) {
4714                conditionMatched = cond.match(matcher, j, seq); 
4715            }
4716            // Reinstate region boundaries
4717            matcher.from = savedFrom;
4718            matcher.lookbehindTo = savedLBT;
4719            return !conditionMatched && next.match(matcher, i, seq);
4720        }
4721    }
4722
4723    /**
4724     * Zero width negative lookbehind, including supplementary
4725     * characters or unpaired surrogates.
4726     */
4727    static final class NotBehindS extends NotBehind {
4728        NotBehindS(Node cond, int rmax, int rmin) {
4729            super(cond, rmax, rmin);
4730        }
4731        boolean match(Matcher matcher, int i, CharSequence seq) {
4732        int rmaxChars = countChars(seq, i, -rmax);
4733        int rminChars = countChars(seq, i, -rmin);
4734            int savedFrom = matcher.from;
4735            int savedLBT = matcher.lookbehindTo;
4736            boolean conditionMatched = false;
4737            int startIndex = (!matcher.transparentBounds) ?
4738                             matcher.from : 0;
4739            int from = Math.max(i - rmaxChars, startIndex);
4740            matcher.lookbehindTo = i;
4741            // Relax transparent region boundaries for lookbehind
4742            if (matcher.transparentBounds)
4743                matcher.from = 0;
4744            for (int j = i - rminChars;
4745                 !conditionMatched && j >= from;
4746                 j -= j>from ? countChars(seq, j, -1) : 1) {
4747                conditionMatched = cond.match(matcher, j, seq);
4748            }
4749            //Reinstate region boundaries
4750            matcher.from = savedFrom;
4751            matcher.lookbehindTo = savedLBT;
4752            return !conditionMatched && next.match(matcher, i, seq);
4753        }
4754    }
4755
4756    /**
4757     * Returns the set union of two CharProperty nodes.
4758     */
4759    private static CharProperty union(final CharProperty lhs,
4760                      final CharProperty rhs) {
4761    return new CharProperty() {
4762        boolean isSatisfiedBy(int ch) {
4763            return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
4764    }
4765
4766    /**
4767     * Returns the set intersection of two CharProperty nodes.
4768     */
4769    private static CharProperty intersection(final CharProperty lhs,
4770                         final CharProperty rhs) {
4771    return new CharProperty() {
4772        boolean isSatisfiedBy(int ch) {
4773            return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
4774    }
4775
4776    /**
4777     * Returns the set difference of two CharProperty nodes.
4778     */
4779    private static CharProperty setDifference(final CharProperty lhs,
4780                          final CharProperty rhs) {
4781    return new CharProperty() {
4782        boolean isSatisfiedBy(int ch) {
4783            return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
4784    }
4785
4786    /**
4787     * Handles word boundaries. Includes a field to allow this one class to
4788     * deal with the different types of word boundaries we can match. The word
4789     * characters include underscores, letters, and digits. Non spacing marks
     * are also part of a word if they have a base character, otherwise
4791     * they are ignored for purposes of finding word boundaries.
4792     */
4793    static final class Bound extends Node {
4794        static int LEFT = 0x1;
4795        static int RIGHT= 0x2;
4796        static int BOTH = 0x3;
4797        static int NONE = 0x4;
4798        int type;
4799        Bound(int n) {
4800            type = n;
4801        }
4802        int check(Matcher matcher, int i, CharSequence seq) {
4803            int ch;
4804            boolean left = false;
4805            int startIndex = matcher.from;
4806            int endIndex = matcher.to;
4807            if (matcher.transparentBounds) {
4808                startIndex = 0;
4809                endIndex = matcher.getTextLength();
4810            }
4811            if (i > startIndex) {
4812                ch = Character.codePointBefore(seq, i);
4813                left = (ch == '_' || Character.isLetterOrDigit(ch) ||
4814                    ((Character.getType(ch) == Character.NON_SPACING_MARK)
4815                     && hasBaseCharacter(matcher, i-1, seq)));
4816            }
4817            boolean right = false;
4818            if (i < endIndex) {
4819                ch = Character.codePointAt(seq, i);
4820                right = (ch == '_' || Character.isLetterOrDigit(ch) ||
4821                    ((Character.getType(ch) == Character.NON_SPACING_MARK)
4822                     && hasBaseCharacter(matcher, i, seq)));
4823            } else {
4824                // Tried to access char past the end
4825                matcher.hitEnd = true;
4826                // The addition of another char could wreck a boundary
4827                matcher.requireEnd = true;
4828            }
4829            return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
4830        }
4831        boolean match(Matcher matcher, int i, CharSequence seq) {
4832            return (check(matcher, i, seq) & type) > 0
4833                && next.match(matcher, i, seq);
4834        }
4835    }
4836
4837    /**
4838     * Non spacing marks only count as word characters in bounds calculations
4839     * if they have a base character.
4840     */
4841    private static boolean hasBaseCharacter(Matcher matcher, int i,
4842                                            CharSequence seq)
4843    {
4844        int start = (!matcher.transparentBounds) ?
4845            matcher.from : 0;
4846        for (int x=i; x >= start; x--) {
4847            int ch = Character.codePointAt(seq, x);
4848            if (Character.isLetterOrDigit(ch))
4849                return true;
4850            if (Character.getType(ch) == Character.NON_SPACING_MARK)
4851                continue;
4852            return false;
4853        }
4854        return false;
4855    }
4856
4857    /**
4858     * Attempts to match a slice in the input using the Boyer-Moore string
4859     * matching algorithm. The algorithm is based on the idea that the
4860     * pattern can be shifted farther ahead in the search text if it is
4861     * matched right to left.
4862     * <p>
4863     * The pattern is compared to the input one character at a time, from
4864     * the rightmost character in the pattern to the left. If the characters
4865     * all match the pattern has been found. If a character does not match,
4866     * the pattern is shifted right a distance that is the maximum of two
4867     * functions, the bad character shift and the good suffix shift. This
4868     * shift moves the attempted match position through the input more
4869     * quickly than a naive one position at a time check.
4870     * <p>
4871     * The bad character shift is based on the character from the text that
4872     * did not match. If the character does not appear in the pattern, the
4873     * pattern can be shifted completely beyond the bad character. If the
4874     * character does occur in the pattern, the pattern can be shifted to
4875     * line the pattern up with the next occurrence of that character.
4876     * <p>
4877     * The good suffix shift is based on the idea that some subset on the right
4878     * side of the pattern has matched. When a bad character is found, the
4879     * pattern can be shifted right by the pattern length if the subset does
4880     * not occur again in pattern, or by the amount of distance to the
4881     * next occurrence of the subset in the pattern.
4882     *
4883     * Boyer-Moore search methods adapted from code by Amy Yu.
4884     */
4885    static class BnM extends Node {
4886        int[] buffer;
4887        int[] lastOcc;
4888        int[] optoSft;
4889
4890        /**
4891         * Pre calculates arrays needed to generate the bad character
4892         * shift and the good suffix shift. Only the last seven bits
4893         * are used to see if chars match; This keeps the tables small
4894         * and covers the heavily used ASCII range, but occasionally
4895         * results in an aliased match for the bad character shift.
4896         */
4897        static Node optimize(Node node) {
4898            if (!(node instanceof Slice)) {
4899                return node;
4900            }
4901
4902            int[] src = ((Slice) node).buffer;
4903            int patternLength = src.length;
4904            // The BM algorithm requires a bit of overhead;
4905            // If the pattern is short don't use it, since
4906            // a shift larger than the pattern length cannot
4907            // be used anyway.
4908            if (patternLength < 4) {
4909                return node;
4910            }
4911            int i, j, k;
4912            int[] lastOcc = new int[128];
4913            int[] optoSft = new int[patternLength];
4914            // Precalculate part of the bad character shift
4915            // It is a table for where in the pattern each
4916            // lower 7-bit value occurs
4917            for (i = 0; i < patternLength; i++) {
4918                lastOcc[src[i]&0x7F] = i + 1;
4919            }
4920            // Precalculate the good suffix shift
4921            // i is the shift amount being considered
4922NEXT:       for (i = patternLength; i > 0; i--) {
4923                // j is the beginning index of suffix being considered
4924                for (j = patternLength - 1; j >= i; j--) {
4925                    // Testing for good suffix
4926                    if (src[j] == src[j-i]) {
4927                        // src[j..len] is a good suffix
4928                        optoSft[j-1] = i;
4929                    } else {
4930                        // No match. The array has already been
4931                        // filled up with correct values before.
4932                        continue NEXT;
4933                    }
4934                }
4935                // This fills up the remaining of optoSft
4936                // any suffix can not have larger shift amount
4937                // then its sub-suffix. Why???
4938                while (j > 0) {
4939                    optoSft[--j] = i;
4940                }
4941            }
4942            // Set the guard value because of unicode compression
4943            optoSft[patternLength-1] = 1;
4944        if (node instanceof SliceS)
4945        return new BnMS(src, lastOcc, optoSft, node.next);
4946            return new BnM(src, lastOcc, optoSft, node.next);
4947        }
4948        BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
4949            this.buffer = src;
4950            this.lastOcc = lastOcc;
4951            this.optoSft = optoSft;
4952            this.next = next;
4953        }
4954        boolean match(Matcher matcher, int i, CharSequence seq) {
4955            int[] src = buffer;
4956            int patternLength = src.length;
4957            int last = matcher.to - patternLength;
4958
4959            // Loop over all possible match positions in text
4960NEXT:       while (i <= last) {
4961                // Loop over pattern from right to left
4962                for (int j = patternLength - 1; j >= 0; j--) {
4963                    int ch = seq.charAt(i+j);
4964                    if (ch != src[j]) {
4965                        // Shift search to the right by the maximum of the
4966                        // bad character shift and the good suffix shift
4967                        i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
4968                        continue NEXT;
4969                    }
4970                }
4971                // Entire pattern matched starting at i
4972                matcher.first = i;
4973                boolean ret = next.match(matcher, i + patternLength, seq);
4974                if (ret) {
4975                    matcher.first = i;
4976                    matcher.groups[0] = matcher.first;
4977                    matcher.groups[1] = matcher.last;
4978                    return true;
4979                }
4980                i++;
4981            }
4982            // BnM is only used as the leading node in the unanchored case,
4983            // and it replaced its Start() which always searches to the end
4984            // if it doesn't find what it's looking for, so hitEnd is true.
4985            matcher.hitEnd = true;
4986            return false;
4987        }
4988        boolean study(TreeInfo info) {
4989            info.minLength += buffer.length;
4990            info.maxValid = false;
4991            return next.study(info);
4992        }
4993    }
4994
4995    /**
4996     * Supplementary support version of BnM(). Unpaired surrogates are
4997     * also handled by this class.
4998     */
4999    static final class BnMS extends BnM {
5000    int lengthInChars;
5001
5002    BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
5003        super(src, lastOcc, optoSft, next);
5004        for (int x = 0; x < buffer.length; x++) {
5005        lengthInChars += Character.charCount(buffer[x]);
5006        }
5007    }
5008    boolean match(Matcher matcher, int i, CharSequence seq) {
5009            int[] src = buffer;
5010            int patternLength = src.length;
5011        int last = matcher.to - lengthInChars;
5012
5013            // Loop over all possible match positions in text
5014NEXT:       while (i <= last) {
5015                // Loop over pattern from right to left
5016        int ch;
5017                for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
5018             j > 0; j -= Character.charCount(ch), x--) {
5019            ch = Character.codePointBefore(seq, i+j);
5020                    if (ch != src[x]) {
5021                        // Shift search to the right by the maximum of the
5022                        // bad character shift and the good suffix shift
5023                        int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
5024            i += countChars(seq, i, n);
5025                        continue NEXT;
5026                    }
5027                }
5028                // Entire pattern matched starting at i
5029                matcher.first = i;
5030                boolean ret = next.match(matcher, i + lengthInChars, seq);
5031                if (ret) {
5032                    matcher.first = i;
5033                    matcher.groups[0] = matcher.first;
5034                    matcher.groups[1] = matcher.last;
5035                    return true;
5036                }
5037        i += countChars(seq, i, 1);
5038            }
5039            matcher.hitEnd = true;
5040            return false;
5041        }
5042    }
5043
5044///////////////////////////////////////////////////////////////////////////////
5045///////////////////////////////////////////////////////////////////////////////
5046
    /**
     *  This must be the very first initializer.
     *  <p>
     *  A single shared plain <code>Node</code> instance.
     *  NOTE(review): the ordering requirement presumably exists because
     *  other static initializers build nodes that refer to this field --
     *  confirm against the <code>Node</code> constructor before moving it.
     */
    static Node accept = new Node();

    /**
     *  A single shared <code>LastNode</code> instance.
     */
    static Node lastAccept = new LastNode();
5053
5054    private static class CharPropertyNames {
5055
5056    static CharProperty charPropertyFor(String name) {
5057        CharPropertyFactory m = map.get(name);
5058        return m == null ? null : m.make();
5059    }
5060
5061    private static abstract class CharPropertyFactory {
5062        abstract CharProperty make();
5063    }
5064
5065    private static void defCategory(String name,
5066                    final int typeMask) {
5067        map.put(name, new CharPropertyFactory() {
5068            CharProperty make() { return new Category(typeMask);}});
5069    }
5070
5071    private static void defRange(String name,
5072                     final int lower, final int upper) {
5073        map.put(name, new CharPropertyFactory() {
5074            CharProperty make() { return rangeFor(lower, upper);}});
5075    }
5076
5077    private static void defCtype(String name,
5078                     final int ctype) {
5079        map.put(name, new CharPropertyFactory() {
5080            CharProperty make() { return new Ctype(ctype);}});
5081    }
5082
5083    private static abstract class CloneableProperty
5084        extends CharProperty implements Cloneable
5085    {
5086        public CloneableProperty clone() {
5087        try {
5088            return (CloneableProperty) super.clone();
5089        } catch (CloneNotSupportedException e) {
5090            throw new AssertionError(e);
5091        }
5092        }
5093    }
5094
5095    private static void defClone(String name,
5096                     final CloneableProperty p) {
5097        map.put(name, new CharPropertyFactory() {
5098            CharProperty make() { return p.clone();}});
5099    }
5100
5101        private static final HashMap<String, CharPropertyFactory> map
5102        = new HashMap<String, CharPropertyFactory>();
5103
5104        static {
5105        // Unicode character property aliases, defined in
5106        // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
5107        defCategory("Cn", 1<<Character.UNASSIGNED);
5108            defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
5109            defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
5110            defCategory("Lt", 1<<Character.TITLECASE_LETTER);
5111            defCategory("Lm", 1<<Character.MODIFIER_LETTER);
5112            defCategory("Lo", 1<<Character.OTHER_LETTER);
5113            defCategory("Mn", 1<<Character.NON_SPACING_MARK);
5114            defCategory("Me", 1<<Character.ENCLOSING_MARK);
5115            defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
5116            defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
5117            defCategory("Nl", 1<<Character.LETTER_NUMBER);
5118            defCategory("No", 1<<Character.OTHER_NUMBER);
5119            defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
5120            defCategory("Zl", 1<<Character.LINE_SEPARATOR);
5121            defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
5122            defCategory("Cc", 1<<Character.CONTROL);
5123            defCategory("Cf", 1<<Character.FORMAT);
5124            defCategory("Co", 1<<Character.PRIVATE_USE);
5125            defCategory("Cs", 1<<Character.SURROGATE);
5126            defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
5127            defCategory("Ps", 1<<Character.START_PUNCTUATION);
5128            defCategory("Pe", 1<<Character.END_PUNCTUATION);
5129            defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
5130            defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
5131            defCategory("Sm", 1<<Character.MATH_SYMBOL);
5132            defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
5133            defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
5134            defCategory("So", 1<<Character.OTHER_SYMBOL);
5135            defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
5136            defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
5137            defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
5138                  (1<<Character.LOWERCASE_LETTER) |
5139                  (1<<Character.TITLECASE_LETTER) |
5140                  (1<<Character.MODIFIER_LETTER)  |
5141                  (1<<Character.OTHER_LETTER)));
5142            defCategory("M", ((1<<Character.NON_SPACING_MARK) |
5143                  (1<<Character.ENCLOSING_MARK)   |
5144                  (1<<Character.COMBINING_SPACING_MARK)));
5145        defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
5146                  (1<<Character.LETTER_NUMBER)        |
5147                  (1<<Character.OTHER_NUMBER)));
5148            defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
5149                  (1<<Character.LINE_SEPARATOR)  |
5150                  (1<<Character.PARAGRAPH_SEPARATOR)));
5151            defCategory("C", ((1<<Character.CONTROL)     |
5152                  (1<<Character.FORMAT)      |
5153                  (1<<Character.PRIVATE_USE) |
5154                  (1<<Character.SURROGATE))); // Other
5155            defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
5156                  (1<<Character.START_PUNCTUATION)     |
5157                  (1<<Character.END_PUNCTUATION)       |
5158                  (1<<Character.CONNECTOR_PUNCTUATION) |
5159                  (1<<Character.OTHER_PUNCTUATION)     |
5160                  (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
5161                  (1<<Character.FINAL_QUOTE_PUNCTUATION)));
5162            defCategory("S", ((1<<Character.MATH_SYMBOL)     |
5163                  (1<<Character.CURRENCY_SYMBOL) |
5164                  (1<<Character.MODIFIER_SYMBOL) |
5165                  (1<<Character.OTHER_SYMBOL)));
5166            defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
5167                   (1<<Character.LOWERCASE_LETTER) |
5168                   (1<<Character.TITLECASE_LETTER)));
5169            defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
5170                   (1<<Character.LOWERCASE_LETTER) |
5171                   (1<<Character.TITLECASE_LETTER) |
5172                   (1<<Character.MODIFIER_LETTER)  |
5173                   (1<<Character.OTHER_LETTER)     |
5174                   (1<<Character.DECIMAL_DIGIT_NUMBER)));
5175        defRange("L1", 0x00, 0xFF); // Latin-1
5176            map.put("all", new CharPropertyFactory() {
5177            CharProperty make() { return new All(); }});
5178
5179        // Posix regular expression character classes, defined in
5180        // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
5181            defRange("ASCII", 0x00, 0x7F);   // ASCII
5182        defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
5183            defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
5184            defCtype("Blank", ASCII.BLANK);  // Space and tab characters
5185            defCtype("Cntrl", ASCII.CNTRL);  // Control characters
5186            defRange("Digit", '0', '9');     // Numeric characters
5187            defCtype("Graph", ASCII.GRAPH);  // printable and visible
5188            defRange("Lower", 'a', 'z');     // Lower-case alphabetic
5189            defRange("Print", 0x20, 0x7E);   // Printable characters
5190            defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
5191            defCtype("Space", ASCII.SPACE);  // Space characters
5192            defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
5193            defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5194
5195        // Java character properties, defined by methods in Character.java
5196        defClone("javaLowerCase", new CloneableProperty() {
5197        boolean isSatisfiedBy(int ch) {
5198            return Character.isLowerCase(ch);}});
5199        defClone("javaUpperCase", new CloneableProperty() {
5200        boolean isSatisfiedBy(int ch) {
5201            return Character.isUpperCase(ch);}});
5202            defClone("javaTitleCase", new CloneableProperty() {
5203        boolean isSatisfiedBy(int ch) {
5204            return Character.isTitleCase(ch);}});
5205            defClone("javaDigit", new CloneableProperty() {
5206        boolean isSatisfiedBy(int ch) {
5207            return Character.isDigit(ch);}});
5208            defClone("javaDefined", new CloneableProperty() {
5209        boolean isSatisfiedBy(int ch) {
5210            return Character.isDefined(ch);}});
5211            defClone("javaLetter", new CloneableProperty() {
5212        boolean isSatisfiedBy(int ch) {
5213            return Character.isLetter(ch);}});
5214            defClone("javaLetterOrDigit", new CloneableProperty() {
5215        boolean isSatisfiedBy(int ch) {
5216            return Character.isLetterOrDigit(ch);}});
5217            defClone("javaJavaIdentifierStart", new CloneableProperty() {
5218        boolean isSatisfiedBy(int ch) {
5219            return Character.isJavaIdentifierStart(ch);}});
5220            defClone("javaJavaIdentifierPart", new CloneableProperty() {
5221        boolean isSatisfiedBy(int ch) {
5222            return Character.isJavaIdentifierPart(ch);}});
5223            defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
5224        boolean isSatisfiedBy(int ch) {
5225            return Character.isUnicodeIdentifierStart(ch);}});
5226            defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
5227        boolean isSatisfiedBy(int ch) {
5228            return Character.isUnicodeIdentifierPart(ch);}});
5229            defClone("javaIdentifierIgnorable", new CloneableProperty() {
5230        boolean isSatisfiedBy(int ch) {
5231            return Character.isIdentifierIgnorable(ch);}});
5232            defClone("javaSpaceChar", new CloneableProperty() {
5233        boolean isSatisfiedBy(int ch) {
5234            return Character.isSpaceChar(ch);}});
5235            defClone("javaWhitespace", new CloneableProperty() {
5236        boolean isSatisfiedBy(int ch) {
5237            return Character.isWhitespace(ch);}});
5238            defClone("javaISOControl", new CloneableProperty() {
5239        boolean isSatisfiedBy(int ch) {
5240            return Character.isISOControl(ch);}});
5241            defClone("javaMirrored", new CloneableProperty() {
5242        boolean isSatisfiedBy(int ch) {
5243            return Character.isMirrored(ch);}});
5244        }
5245    }
5246}
5247