| StopFilter.java |
1 package org.apache.lucene.analysis;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import java.io.IOException;
21 import java.util.Arrays;
22 import java.util.Set;
23
24 /**
25 * Removes stop words from a token stream.
26 */
27
28 public final class StopFilter extends TokenFilter {
29
30 private static boolean ENABLE_POSITION_INCREMENTS_DEFAULT = false;
31
32 private final CharArraySet stopWords;
33 private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
34
35 /**
36 * Construct a token stream filtering the given input.
37 */
38 public StopFilter(TokenStream input, String [] stopWords)
39 {
40 this(input, stopWords, false);
41 }
42
43 /**
44 * Constructs a filter which removes words from the input
45 * TokenStream that are named in the array of words.
46 */
47 public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
48 super(in);
49 this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
50 }
51
52
53 /**
54 * Construct a token stream filtering the given input.
55 * If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
56 * <code>makeStopSet()</code> was used to construct the set) it will be directly used
57 * and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
58 * directly controls case sensitivity.
59 * <p/>
60 * If <code>stopWords</code> is not an instance of {@link CharArraySet},
61 * a new CharArraySet will be constructed and <code>ignoreCase</code> will be
62 * used to specify the case sensitivity of that set.
63 *
64 * @param input
65 * @param stopWords The set of Stop Words.
66 * @param ignoreCase -Ignore case when stopping.
67 */
68 public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
69 {
70 super(input);
71 if (stopWords instanceof CharArraySet) {
72 this.stopWords = (CharArraySet)stopWords;
73 } else {
74 this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
75 this.stopWords.addAll(stopWords);
76 }
77 }
78
79 /**
80 * Constructs a filter which removes words from the input
81 * TokenStream that are named in the Set.
82 *
83 * @see #makeStopSet(java.lang.String[])
84 */
85 public StopFilter(TokenStream in, Set stopWords) {
86 this(in, stopWords, false);
87 }
88
89 /**
90 * Builds a Set from an array of stop words,
91 * appropriate for passing into the StopFilter constructor.
92 * This permits this stopWords construction to be cached once when
93 * an Analyzer is constructed.
94 *
95 * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
96 */
97 public static final Set makeStopSet(String[] stopWords) {
98 return makeStopSet(stopWords, false);
99 }
100
101 /**
102 *
103 * @param stopWords
104 * @param ignoreCase If true, all words are lower cased first.
105 * @return a Set containing the words
106 */
107 public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
108 CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
109 stopSet.addAll(Arrays.asList(stopWords));
110 return stopSet;
111 }
112
113 /**
114 * Returns the next input Token whose term() is not a stop word.
115 */
116 public final Token next(final Token reusableToken) throws IOException {
117 assert reusableToken != null;
118 // return the first non-stop word found
119 int skippedPositions = 0;
120 for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
121 if (!stopWords.contains(nextToken.termBuffer(), 0, nextToken.termLength())) {
122 if (enablePositionIncrements) {
123 nextToken.setPositionIncrement(nextToken.getPositionIncrement() + skippedPositions);
124 }
125 return nextToken;
126 }
127 skippedPositions += nextToken.getPositionIncrement();
128 }
129 // reached EOS -- return null
130 return null;
131 }
132
133 /**
134 * @see #setEnablePositionIncrementsDefault(boolean).
135 */
136 public static boolean getEnablePositionIncrementsDefault() {
137 return ENABLE_POSITION_INCREMENTS_DEFAULT;
138 }
139
140 /**
141 * Set the default position increments behavior of every StopFilter created from now on.
142 * <p>
143 * Note: behavior of a single StopFilter instance can be modified
144 * with {@link #setEnablePositionIncrements(boolean)}.
145 * This static method allows control over behavior of classes using StopFilters internally,
146 * for example {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer}.
147 * <p>
148 * Default : false.
149 * @see #setEnablePositionIncrements(boolean).
150 */
151 public static void setEnablePositionIncrementsDefault(boolean defaultValue) {
152 ENABLE_POSITION_INCREMENTS_DEFAULT = defaultValue;
153 }
154
155 /**
156 * @see #setEnablePositionIncrements(boolean).
157 */
158 public boolean getEnablePositionIncrements() {
159 return enablePositionIncrements;
160 }
161
162 /**
163 * Set to <code>true</code> to make <b>this</b> StopFilter enable position increments to result tokens.
164 * <p>
165 * When set, when a token is stopped (omitted), the position increment of
166 * the following token is incremented.
167 * <p>
168 * Default: see {@link #setEnablePositionIncrementsDefault(boolean)}.
169 */
170 public void setEnablePositionIncrements(boolean enable) {
171 this.enablePositionIncrements = enable;
172 }
173 }
174 | StopFilter.java |