org.apache.lucene.search.PhraseQuery (Java2HTML)

1   package org.apache.lucene.search;
2   
3   /**
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.Set;
22  import java.util.ArrayList;
23  
24  import org.apache.lucene.index.Term;
25  import org.apache.lucene.index.TermPositions;
26  import org.apache.lucene.index.IndexReader;
27  import org.apache.lucene.util.ToStringUtils;
28  
29  /** A Query that matches documents containing a particular sequence of terms.
30   * A PhraseQuery is built by QueryParser for input like <code>"new york"</code>.
31   * 
32   * <p>This query may be combined with other terms or queries with a {@link BooleanQuery}.
33   */
34  public class PhraseQuery extends Query {
35    private String field;
36    private ArrayList terms = new ArrayList(4);
37    private ArrayList positions = new ArrayList(4);
38    private int maxPosition = 0;
39    private int slop = 0;
40  
41    /** Constructs an empty phrase query. */
42    public PhraseQuery() {}
43  
44    /** Sets the number of other words permitted between words in query phrase.
45      If zero, then this is an exact phrase search.  For larger values this works
46      like a <code>WITHIN</code> or <code>NEAR</code> operator.
47  
48      <p>The slop is in fact an edit-distance, where the units correspond to
49      moves of terms in the query phrase out of position.  For example, to switch
50      the order of two words requires two moves (the first move places the words
51      atop one another), so to permit re-orderings of phrases, the slop must be
52      at least two.
53  
54      <p>More exact matches are scored higher than sloppier matches, thus search
55      results are sorted by exactness.
56  
57      <p>The slop is zero by default, requiring exact matches.*/
58    public void setSlop(int s) { slop = s; }
59    /** Returns the slop.  See setSlop(). */
60    public int getSlop() { return slop; }
61  
62    /**
63     * Adds a term to the end of the query phrase.
64     * The relative position of the term is the one immediately after the last term added.
65     */
66    public void add(Term term) {
67      int position = 0;
68      if(positions.size() > 0)
69          position = ((Integer) positions.get(positions.size()-1)).intValue() + 1;
70  
71      add(term, position);
72    }
73  
74    /**
75     * Adds a term to the end of the query phrase.
76     * The relative position of the term within the phrase is specified explicitly.
77     * This allows e.g. phrases with more than one term at the same position
78     * or phrases with gaps (e.g. in connection with stopwords).
79     * 
80     * @param term
81     * @param position
82     */
83    public void add(Term term, int position) {
84        if (terms.size() == 0)
85            field = term.field();
86        else if (term.field() != field)
87            throw new IllegalArgumentException("All phrase terms must be in the same field: " + term);
88  
89        terms.add(term);
90        positions.add(new Integer(position));
91        if (position > maxPosition) maxPosition = position;
92    }
93  
94    /** Returns the set of terms in this phrase. */
95    public Term[] getTerms() {
96      return (Term[])terms.toArray(new Term[0]);
97    }
98  
99    /**
100    * Returns the relative positions of terms in this phrase.
101    */
102   public int[] getPositions() {
103       int[] result = new int[positions.size()];
104       for(int i = 0; i < positions.size(); i++)
105           result[i] = ((Integer) positions.get(i)).intValue();
106       return result;
107   }
108 
109   private class PhraseWeight implements Weight {
110     private Similarity similarity;
111     private float value;
112     private float idf;
113     private float queryNorm;
114     private float queryWeight;
115 
116     public PhraseWeight(Searcher searcher)
117       throws IOException {
118       this.similarity = getSimilarity(searcher);
119 
120       idf = similarity.idf(terms, searcher);
121     }
122 
123     public String toString() { return "weight(" + PhraseQuery.this + ")"; }
124 
125     public Query getQuery() { return PhraseQuery.this; }
126     public float getValue() { return value; }
127 
128     public float sumOfSquaredWeights() {
129       queryWeight = idf * getBoost();             // compute query weight
130       return queryWeight * queryWeight;           // square it
131     }
132 
133     public void normalize(float queryNorm) {
134       this.queryNorm = queryNorm;
135       queryWeight *= queryNorm;                   // normalize query weight
136       value = queryWeight * idf;                  // idf for document 
137     }
138 
139     public Scorer scorer(IndexReader reader) throws IOException {
140       if (terms.size() == 0)              // optimize zero-term case
141         return null;
142 
143       TermPositions[] tps = new TermPositions[terms.size()];
144       for (int i = 0; i < terms.size(); i++) {
145         TermPositions p = reader.termPositions((Term)terms.get(i));
146         if (p == null)
147           return null;
148         tps[i] = p;
149       }
150 
151       if (slop == 0)                  // optimize exact case
152         return new ExactPhraseScorer(this, tps, getPositions(), similarity,
153                                      reader.norms(field));
154       else
155         return
156           new SloppyPhraseScorer(this, tps, getPositions(), similarity, slop,
157                                  reader.norms(field));
158 
159     }
160 
161     public Explanation explain(IndexReader reader, int doc)
162       throws IOException {
163 
164       Explanation result = new Explanation();
165       result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
166 
167       StringBuffer docFreqs = new StringBuffer();
168       StringBuffer query = new StringBuffer();
169       query.append('\"');
170       for (int i = 0; i < terms.size(); i++) {
171         if (i != 0) {
172           docFreqs.append(" ");
173           query.append(" ");
174         }
175 
176         Term term = (Term)terms.get(i);
177 
178         docFreqs.append(term.text());
179         docFreqs.append("=");
180         docFreqs.append(reader.docFreq(term));
181 
182         query.append(term.text());
183       }
184       query.append('\"');
185 
186       Explanation idfExpl =
187         new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
188 
189       // explain query weight
190       Explanation queryExpl = new Explanation();
191       queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
192 
193       Explanation boostExpl = new Explanation(getBoost(), "boost");
194       if (getBoost() != 1.0f)
195         queryExpl.addDetail(boostExpl);
196       queryExpl.addDetail(idfExpl);
197 
198       Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
199       queryExpl.addDetail(queryNormExpl);
200 
201       queryExpl.setValue(boostExpl.getValue() *
202                          idfExpl.getValue() *
203                          queryNormExpl.getValue());
204 
205       result.addDetail(queryExpl);
206 
207       // explain field weight
208       Explanation fieldExpl = new Explanation();
209       fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
210                                "), product of:");
211 
212       Explanation tfExpl = scorer(reader).explain(doc);
213       fieldExpl.addDetail(tfExpl);
214       fieldExpl.addDetail(idfExpl);
215 
216       Explanation fieldNormExpl = new Explanation();
217       byte[] fieldNorms = reader.norms(field);
218       float fieldNorm =
219         fieldNorms!=null ? Similarity.decodeNorm(fieldNorms[doc]) : 0.0f;
220       fieldNormExpl.setValue(fieldNorm);
221       fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
222       fieldExpl.addDetail(fieldNormExpl);
223 
224       fieldExpl.setValue(tfExpl.getValue() *
225                          idfExpl.getValue() *
226                          fieldNormExpl.getValue());
227 
228       result.addDetail(fieldExpl);
229 
230       // combine them
231       result.setValue(queryExpl.getValue() * fieldExpl.getValue());
232 
233       if (queryExpl.getValue() == 1.0f)
234         return fieldExpl;
235 
236       return result;
237     }
238   }
239 
240   protected Weight createWeight(Searcher searcher) throws IOException {
241     if (terms.size() == 1) {              // optimize one-term case
242       Term term = (Term)terms.get(0);
243       Query termQuery = new TermQuery(term);
244       termQuery.setBoost(getBoost());
245       return termQuery.createWeight(searcher);
246     }
247     return new PhraseWeight(searcher);
248   }
249 
250   /**
251    * @see org.apache.lucene.search.Query#extractTerms(java.util.Set)
252    */
253   public void extractTerms(Set queryTerms) {
254     queryTerms.addAll(terms);
255   }
256 
257   /** Prints a user-readable version of this query. */
258   public String toString(String f) {
259     StringBuffer buffer = new StringBuffer();
260     if (field != null && !field.equals(f)) {
261       buffer.append(field);
262       buffer.append(":");
263     }
264 
265     buffer.append("\"");
266     String[] pieces = new String[maxPosition + 1];
267     for (int i = 0; i < terms.size(); i++) {
268       int pos = ((Integer)positions.get(i)).intValue();
269       String s = pieces[pos];
270       if (s == null) {
271         s = ((Term)terms.get(i)).text();
272       } else {
273         s = s + "|" + ((Term)terms.get(i)).text();
274       }
275       pieces[pos] = s;
276     }
277     for (int i = 0; i < pieces.length; i++) {
278       if (i > 0) {
279         buffer.append(' ');
280       }
281       String s = pieces[i];
282       if (s == null) {
283         buffer.append('?');
284       } else {
285         buffer.append(s);
286       }
287     }
288     buffer.append("\"");
289 
290     if (slop != 0) {
291       buffer.append("~");
292       buffer.append(slop);
293     }
294 
295     buffer.append(ToStringUtils.boost(getBoost()));
296 
297     return buffer.toString();
298   }
299 
300   /** Returns true iff <code>o</code> is equal to this. */
301   public boolean equals(Object o) {
302     if (!(o instanceof PhraseQuery))
303       return false;
304     PhraseQuery other = (PhraseQuery)o;
305     return (this.getBoost() == other.getBoost())
306       && (this.slop == other.slop)
307       &&  this.terms.equals(other.terms)
308       && this.positions.equals(other.positions);
309   }
310 
311   /** Returns a hash code value for this object.*/
312   public int hashCode() {
313     return Float.floatToIntBits(getBoost())
314       ^ slop
315       ^ terms.hashCode()
316       ^ positions.hashCode();
317   }
318 
319 }
320