1   package org.apache.lucene.search;
2   
3   /**
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import org.apache.lucene.index.IndexReader;
21  import org.apache.lucene.index.Term;
22  import org.apache.lucene.util.PriorityQueue;
23  import org.apache.lucene.util.ToStringUtils;
24  
25  import java.io.IOException;
26  
27  /** Implements the fuzzy search query. The similiarity measurement
28   * is based on the Levenshtein (edit distance) algorithm.
29   */
30  public class FuzzyQuery extends MultiTermQuery {
31    
32    public final static float defaultMinSimilarity = 0.5f;
33    public final static int defaultPrefixLength = 0;
34    
35    private float minimumSimilarity;
36    private int prefixLength;
37    
38    /**
39     * Create a new FuzzyQuery that will match terms with a similarity 
40     * of at least <code>minimumSimilarity</code> to <code>term</code>.
41     * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
42     * of that length is also required.
43     * 
44     * @param term the term to search for
45     * @param minimumSimilarity a value between 0 and 1 to set the required similarity
46     *  between the query term and the matching terms. For example, for a
47     *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
48     *  as the query term is considered similar to the query term if the edit distance
49     *  between both terms is less than <code>length(term)*0.5</code>
50     * @param prefixLength length of common (non-fuzzy) prefix
51     * @throws IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0
52     * or if prefixLength &lt; 0
53     */
54    public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
55      super(term);
56      
57      if (minimumSimilarity >= 1.0f)
58        throw new IllegalArgumentException("minimumSimilarity >= 1");
59      else if (minimumSimilarity < 0.0f)
60        throw new IllegalArgumentException("minimumSimilarity < 0");
61      if (prefixLength < 0)
62        throw new IllegalArgumentException("prefixLength < 0");
63      
64      this.minimumSimilarity = minimumSimilarity;
65      this.prefixLength = prefixLength;
66    }
67    
68    /**
69     * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
70     */
71    public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
72        this(term, minimumSimilarity, defaultPrefixLength);
73    }
74  
75    /**
76     * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
77     */
78    public FuzzyQuery(Term term) {
79      this(term, defaultMinSimilarity, defaultPrefixLength);
80    }
81    
82    /**
83     * Returns the minimum similarity that is required for this query to match.
84     * @return float value between 0.0 and 1.0
85     */
86    public float getMinSimilarity() {
87      return minimumSimilarity;
88    }
89      
90    /**
91     * Returns the non-fuzzy prefix length. This is the number of characters at the start
92     * of a term that must be identical (not fuzzy) to the query term if the query
93     * is to match that term. 
94     */
95    public int getPrefixLength() {
96      return prefixLength;
97    }
98  
99    protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
100     return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
101   }
102   
103   public Query rewrite(IndexReader reader) throws IOException {
104     FilteredTermEnum enumerator = getEnum(reader);
105     int maxClauseCount = BooleanQuery.getMaxClauseCount();
106     ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
107     ScoreTerm reusableST = null;
108 
109     try {
110       do {
111         float score = 0.0f;
112         Term t = enumerator.term();
113         if (t != null) {
114           score = enumerator.difference();
115           if (reusableST == null) {
116             reusableST = new ScoreTerm(t, score);
117           } else if (score >= reusableST.score) {
118             // reusableST holds the last "rejected" entry, so, if
119             // this new score is not better than that, there's no
120             // need to try inserting it
121             reusableST.score = score;
122             reusableST.term = t;
123           } else {
124             continue;
125           }
126 
127           reusableST = (ScoreTerm) stQueue.insertWithOverflow(reusableST);
128         }
129       } while (enumerator.next());
130     } finally {
131       enumerator.close();
132     }
133     
134     BooleanQuery query = new BooleanQuery(true);
135     int size = stQueue.size();
136     for(int i = 0; i < size; i++){
137       ScoreTerm st = (ScoreTerm) stQueue.pop();
138       TermQuery tq = new TermQuery(st.term);      // found a match
139       tq.setBoost(getBoost() * st.score); // set the boost
140       query.add(tq, BooleanClause.Occur.SHOULD);          // add to query
141     }
142 
143     return query;
144   }
145     
146   public String toString(String field) {
147     StringBuffer buffer = new StringBuffer();
148     Term term = getTerm();
149     if (!term.field().equals(field)) {
150         buffer.append(term.field());
151         buffer.append(":");
152     }
153     buffer.append(term.text());
154     buffer.append('~');
155     buffer.append(Float.toString(minimumSimilarity));
156     buffer.append(ToStringUtils.boost(getBoost()));
157     return buffer.toString();
158   }
159   
160   protected static class ScoreTerm {
161     public Term term;
162     public float score;
163     
164     public ScoreTerm(Term term, float score){
165       this.term = term;
166       this.score = score;
167     }
168   }
169   
170   protected static class ScoreTermQueue extends PriorityQueue {
171     
172     public ScoreTermQueue(int size){
173       initialize(size);
174     }
175     
176     /* (non-Javadoc)
177      * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
178      */
179     protected boolean lessThan(Object a, Object b) {
180       ScoreTerm termA = (ScoreTerm)a;
181       ScoreTerm termB = (ScoreTerm)b;
182       if (termA.score == termB.score)
183         return termA.term.compareTo(termB.term) > 0;
184       else
185         return termA.score < termB.score;
186     }
187     
188   }
189 
190   public boolean equals(Object o) {
191     if (this == o) return true;
192     if (!(o instanceof FuzzyQuery)) return false;
193     if (!super.equals(o)) return false;
194 
195     final FuzzyQuery fuzzyQuery = (FuzzyQuery) o;
196 
197     if (minimumSimilarity != fuzzyQuery.minimumSimilarity) return false;
198     if (prefixLength != fuzzyQuery.prefixLength) return false;
199 
200     return true;
201   }
202 
203   public int hashCode() {
204     int result = super.hashCode();
205     result = 29 * result + minimumSimilarity != +0.0f ? Float.floatToIntBits(minimumSimilarity) : 0;
206     result = 29 * result + prefixLength;
207     return result;
208   }
209 }
210