1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.lucene.analysis.standard;
19  
20  import java.io.IOException;
21  import java.io.Reader;
22  
23  import org.apache.lucene.analysis.Token;
24  import org.apache.lucene.analysis.Tokenizer;
25  
/** A grammar-based tokenizer constructed with JFlex.
27   *
28   * <p> This should be a good tokenizer for most European-language documents:
29   *
30   * <ul>
31   *   <li>Splits words at punctuation characters, removing punctuation. However, a 
32   *     dot that's not followed by whitespace is considered part of a token.
33   *   <li>Splits words at hyphens, unless there's a number in the token, in which case
34   *     the whole token is interpreted as a product number and is not split.
35   *   <li>Recognizes email addresses and internet hostnames as one token.
36   * </ul>
37   *
38   * <p>Many applications have specific tokenizer needs.  If this tokenizer does
39   * not suit your application, please consider copying this source code
40   * directory to your project and maintaining your own grammar-based tokenizer.
41   */
42  
43  public class StandardTokenizer extends Tokenizer {
44    /** A private instance of the JFlex-constructed scanner */
45    private final StandardTokenizerImpl scanner;
46  
47    public static final int ALPHANUM          = 0;
48    public static final int APOSTROPHE        = 1;
49    public static final int ACRONYM           = 2;
50    public static final int COMPANY           = 3;
51    public static final int EMAIL             = 4;
52    public static final int HOST              = 5;
53    public static final int NUM               = 6;
54    public static final int CJ                = 7;
55  
56    /**
57     * @deprecated this solves a bug where HOSTs that end with '.' are identified
58     *             as ACRONYMs. It is deprecated and will be removed in the next
59     *             release.
60     */
61    public static final int ACRONYM_DEP       = 8;
62  
63    /** String token types that correspond to token type int constants */
64    public static final String [] TOKEN_TYPES = new String [] {
65      "<ALPHANUM>",
66      "<APOSTROPHE>",
67      "<ACRONYM>",
68      "<COMPANY>",
69      "<EMAIL>",
70      "<HOST>",
71      "<NUM>",
72      "<CJ>",
73      "<ACRONYM_DEP>"
74    };
75  
76    /** @deprecated Please use {@link #TOKEN_TYPES} instead */
77    public static final String [] tokenImage = TOKEN_TYPES;
78  
79    /**
80     * Specifies whether deprecated acronyms should be replaced with HOST type.
81     * This is false by default to support backward compatibility.
82     *<p/>
83     * See http://issues.apache.org/jira/browse/LUCENE-1068
84     * 
85     * @deprecated this should be removed in the next release (3.0).
86     */
87    private boolean replaceInvalidAcronym = false;
88      
89    void setInput(Reader reader) {
90      this.input = reader;
91    }
92  
93    private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
94  
95    /** Set the max allowed token length.  Any token longer
96     *  than this is skipped. */
97    public void setMaxTokenLength(int length) {
98      this.maxTokenLength = length;
99    }
100 
101   /** @see #setMaxTokenLength */
102   public int getMaxTokenLength() {
103     return maxTokenLength;
104   }
105 
106     /**
107      * Creates a new instance of the {@link StandardTokenizer}. Attaches the
108      * <code>input</code> to a newly created JFlex scanner.
109      */
110     public StandardTokenizer(Reader input) {
111         this.input = input;
112         this.scanner = new StandardTokenizerImpl(input);
113     }
114 
115   /**
116    * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}.  Attaches
117    * the <code>input</code> to the newly created JFlex scanner.
118    *
119    * @param input The input reader
120    * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms with HOST.
121    *
122    * See http://issues.apache.org/jira/browse/LUCENE-1068
123    */
124   public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
125     this.replaceInvalidAcronym = replaceInvalidAcronym;
126     this.input = input;
127     this.scanner = new StandardTokenizerImpl(input);
128   }
129 
130   /*
131    * (non-Javadoc)
132    *
133    * @see org.apache.lucene.analysis.TokenStream#next()
134    */
135   public Token next(final Token reusableToken) throws IOException {
136       assert reusableToken != null;
137       int posIncr = 1;
138 
139       while(true) {
140     int tokenType = scanner.getNextToken();
141 
142     if (tokenType == StandardTokenizerImpl.YYEOF) {
143         return null;
144     }
145 
146         if (scanner.yylength() <= maxTokenLength) {
147           reusableToken.clear();
148           reusableToken.setPositionIncrement(posIncr);
149           scanner.getText(reusableToken);
150           final int start = scanner.yychar();
151           reusableToken.setStartOffset(start);
152           reusableToken.setEndOffset(start+reusableToken.termLength());
153           // This 'if' should be removed in the next release. For now, it converts
154           // invalid acronyms to HOST. When removed, only the 'else' part should
155           // remain.
156           if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
157             if (replaceInvalidAcronym) {
158               reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
159               reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
160             } else {
161               reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
162             }
163           } else {
164             reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
165           }
166           return reusableToken;
167         } else
168           // When we skip a too-long term, we still increment the
169           // position increment
170           posIncr++;
171       }
172     }
173 
174     /*
175      * (non-Javadoc)
176      *
177      * @see org.apache.lucene.analysis.TokenStream#reset()
178      */
179     public void reset() throws IOException {
180     super.reset();
181     scanner.yyreset(input);
182     }
183 
184     public void reset(Reader reader) throws IOException {
185         input = reader;
186         reset();
187     }
188 
189   /**
190    * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
191    * when they should have been labeled as hosts instead.
192    * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
193    *
194    * @deprecated Remove in 3.X and make true the only valid value
195    */
196   public boolean isReplaceInvalidAcronym() {
197     return replaceInvalidAcronym;
198   }
199 
200   /**
201    *
202    * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
203    * @deprecated Remove in 3.X and make true the only valid value
204    *
205    * See https://issues.apache.org/jira/browse/LUCENE-1068
206    */
207   public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
208     this.replaceInvalidAcronym = replaceInvalidAcronym;
209   }
210 }
211