| StandardTokenizer.java |
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.lucene.analysis.standard;
19
20 import java.io.IOException;
21 import java.io.Reader;
22
23 import org.apache.lucene.analysis.Token;
24 import org.apache.lucene.analysis.Tokenizer;
25
26 /** A grammar-based tokenizer constructed with JFlex
27 *
28 * <p> This should be a good tokenizer for most European-language documents:
29 *
30 * <ul>
31 * <li>Splits words at punctuation characters, removing punctuation. However, a
32 * dot that's not followed by whitespace is considered part of a token.
33 * <li>Splits words at hyphens, unless there's a number in the token, in which case
34 * the whole token is interpreted as a product number and is not split.
35 * <li>Recognizes email addresses and internet hostnames as one token.
36 * </ul>
37 *
38 * <p>Many applications have specific tokenizer needs. If this tokenizer does
39 * not suit your application, please consider copying this source code
40 * directory to your project and maintaining your own grammar-based tokenizer.
41 */
42
43 public class StandardTokenizer extends Tokenizer {
44 /** A private instance of the JFlex-constructed scanner */
45 private final StandardTokenizerImpl scanner;
46
47 public static final int ALPHANUM = 0;
48 public static final int APOSTROPHE = 1;
49 public static final int ACRONYM = 2;
50 public static final int COMPANY = 3;
51 public static final int EMAIL = 4;
52 public static final int HOST = 5;
53 public static final int NUM = 6;
54 public static final int CJ = 7;
55
56 /**
57 * @deprecated this solves a bug where HOSTs that end with '.' are identified
58 * as ACRONYMs. It is deprecated and will be removed in the next
59 * release.
60 */
61 public static final int ACRONYM_DEP = 8;
62
63 /** String token types that correspond to token type int constants */
64 public static final String [] TOKEN_TYPES = new String [] {
65 "<ALPHANUM>",
66 "<APOSTROPHE>",
67 "<ACRONYM>",
68 "<COMPANY>",
69 "<EMAIL>",
70 "<HOST>",
71 "<NUM>",
72 "<CJ>",
73 "<ACRONYM_DEP>"
74 };
75
76 /** @deprecated Please use {@link #TOKEN_TYPES} instead */
77 public static final String [] tokenImage = TOKEN_TYPES;
78
79 /**
80 * Specifies whether deprecated acronyms should be replaced with HOST type.
81 * This is false by default to support backward compatibility.
82 *<p/>
83 * See http://issues.apache.org/jira/browse/LUCENE-1068
84 *
85 * @deprecated this should be removed in the next release (3.0).
86 */
87 private boolean replaceInvalidAcronym = false;
88
89 void setInput(Reader reader) {
90 this.input = reader;
91 }
92
93 private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
94
95 /** Set the max allowed token length. Any token longer
96 * than this is skipped. */
97 public void setMaxTokenLength(int length) {
98 this.maxTokenLength = length;
99 }
100
101 /** @see #setMaxTokenLength */
102 public int getMaxTokenLength() {
103 return maxTokenLength;
104 }
105
106 /**
107 * Creates a new instance of the {@link StandardTokenizer}. Attaches the
108 * <code>input</code> to a newly created JFlex scanner.
109 */
110 public StandardTokenizer(Reader input) {
111 this.input = input;
112 this.scanner = new StandardTokenizerImpl(input);
113 }
114
115 /**
116 * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
117 * the <code>input</code> to the newly created JFlex scanner.
118 *
119 * @param input The input reader
120 * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms with HOST.
121 *
122 * See http://issues.apache.org/jira/browse/LUCENE-1068
123 */
124 public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
125 this.replaceInvalidAcronym = replaceInvalidAcronym;
126 this.input = input;
127 this.scanner = new StandardTokenizerImpl(input);
128 }
129
130 /*
131 * (non-Javadoc)
132 *
133 * @see org.apache.lucene.analysis.TokenStream#next()
134 */
135 public Token next(final Token reusableToken) throws IOException {
136 assert reusableToken != null;
137 int posIncr = 1;
138
139 while(true) {
140 int tokenType = scanner.getNextToken();
141
142 if (tokenType == StandardTokenizerImpl.YYEOF) {
143 return null;
144 }
145
146 if (scanner.yylength() <= maxTokenLength) {
147 reusableToken.clear();
148 reusableToken.setPositionIncrement(posIncr);
149 scanner.getText(reusableToken);
150 final int start = scanner.yychar();
151 reusableToken.setStartOffset(start);
152 reusableToken.setEndOffset(start+reusableToken.termLength());
153 // This 'if' should be removed in the next release. For now, it converts
154 // invalid acronyms to HOST. When removed, only the 'else' part should
155 // remain.
156 if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
157 if (replaceInvalidAcronym) {
158 reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
159 reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
160 } else {
161 reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
162 }
163 } else {
164 reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
165 }
166 return reusableToken;
167 } else
168 // When we skip a too-long term, we still increment the
169 // position increment
170 posIncr++;
171 }
172 }
173
174 /*
175 * (non-Javadoc)
176 *
177 * @see org.apache.lucene.analysis.TokenStream#reset()
178 */
179 public void reset() throws IOException {
180 super.reset();
181 scanner.yyreset(input);
182 }
183
184 public void reset(Reader reader) throws IOException {
185 input = reader;
186 reset();
187 }
188
189 /**
190 * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
191 * when they should have been labeled as hosts instead.
192 * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
193 *
194 * @deprecated Remove in 3.X and make true the only valid value
195 */
196 public boolean isReplaceInvalidAcronym() {
197 return replaceInvalidAcronym;
198 }
199
200 /**
201 *
202 * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
203 * @deprecated Remove in 3.X and make true the only valid value
204 *
205 * See https://issues.apache.org/jira/browse/LUCENE-1068
206 */
207 public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
208 this.replaceInvalidAcronym = replaceInvalidAcronym;
209 }
210 }
211 | StandardTokenizer.java |