| Similarity.java |
1 package org.apache.lucene.search;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.index.Term;
21 import org.apache.lucene.util.SmallFloat;
22
23 import java.io.IOException;
24 import java.io.Serializable;
25 import java.util.Collection;
26 import java.util.Iterator;
27
28 /** Expert: Scoring API.
29 * <p>Subclasses implement search scoring.
30 *
31 * <p>The score of query <code>q</code> for document <code>d</code> correlates to the
32 * cosine-distance or dot-product between document and query vectors in a
33 * <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
34 * Vector Space Model (VSM) of Information Retrieval</a>.
35 * A document whose vector is closer to the query vector in that model is scored higher.
36 *
37 * The score is computed as follows:
38 *
39 * <P>
40 * <table cellpadding="1" cellspacing="0" border="1" align="center">
41 * <tr><td>
42 * <table cellpadding="1" cellspacing="0" border="0" align="center">
43 * <tr>
44 * <td valign="middle" align="right" rowspan="1">
45 * score(q,d) =
46 * <A HREF="#formula_coord">coord(q,d)</A> ·
47 * <A HREF="#formula_queryNorm">queryNorm(q)</A> ·
48 * </td>
49 * <td valign="bottom" align="center" rowspan="1">
50 * <big><big><big>∑</big></big></big>
51 * </td>
52 * <td valign="middle" align="right" rowspan="1">
53 * <big><big>(</big></big>
54 * <A HREF="#formula_tf">tf(t in d)</A> ·
55 * <A HREF="#formula_idf">idf(t)</A><sup>2</sup> ·
56 * <A HREF="#formula_termBoost">t.getBoost()</A> ·
57 * <A HREF="#formula_norm">norm(t,d)</A>
58 * <big><big>)</big></big>
59 * </td>
60 * </tr>
61 *      <tr valign="top">
62 * <td></td>
63 * <td align="center"><small>t in q</small></td>
64 * <td></td>
65 * </tr>
66 * </table>
67 * </td></tr>
68 * </table>
69 *
70 * <p> where
71 * <ol>
72 * <li>
73 * <A NAME="formula_tf"></A>
74 * <b>tf(t in d)</b>
75 * correlates to the term's <i>frequency</i>,
76 * defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
77 * Documents that have more occurrences of a given term receive a higher score.
78 * The default computation for <i>tf(t in d)</i> in
79 * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
80 *
81 * <br> <br>
82 * <table cellpadding="2" cellspacing="2" border="0" align="center">
83 * <tr>
84 * <td valign="middle" align="right" rowspan="1">
85 * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} =
86 * </td>
87 * <td valign="top" align="center" rowspan="1">
88 * frequency<sup><big>½</big></sup>
89 * </td>
90 * </tr>
91 * </table>
92 * <br> <br>
93 * </li>
94 *
95 * <li>
96 * <A NAME="formula_idf"></A>
97 * <b>idf(t)</b> stands for Inverse Document Frequency. This value
98 * correlates to the inverse of <i>docFreq</i>
99 * (the number of documents in which the term <i>t</i> appears).
100 * This means rarer terms contribute more to the total score.
101 * The default computation for <i>idf(t)</i> in
102 * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
103 *
104 * <br> <br>
105 * <table cellpadding="2" cellspacing="2" border="0" align="center">
106 * <tr>
107 * <td valign="middle" align="right">
108 * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} =
109 * </td>
110 * <td valign="middle" align="center">
111 * 1 + log <big>(</big>
112 * </td>
113 * <td valign="middle" align="center">
114 * <table>
115 * <tr><td align="center"><small>numDocs</small></td></tr>
116 * <tr><td align="center">–––––––––</td></tr>
117 * <tr><td align="center"><small>docFreq+1</small></td></tr>
118 * </table>
119 * </td>
120 * <td valign="middle" align="center">
121 * <big>)</big>
122 * </td>
123 * </tr>
124 * </table>
125 * <br> <br>
126 * </li>
127 *
128 * <li>
129 * <A NAME="formula_coord"></A>
130 * <b>coord(q,d)</b>
131 * is a score factor based on how many of the query terms are found in the specified document.
132 * Typically, a document that contains more of the query's terms will receive a higher score
133 * than another document with fewer query terms.
134 * This is a search time factor computed in
135 * {@link #coord(int, int) coord(q,d)}
136 * by the Similarity in effect at search time.
137 * <br> <br>
138 * </li>
139 *
140 * <li><b>
141 * <A NAME="formula_queryNorm"></A>
142 * queryNorm(q)
143 * </b>
144 * is a normalizing factor used to make scores between queries comparable.
145 * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
146 * but rather just attempts to make scores from different queries (or even different indexes) comparable.
147 * This is a search time factor computed by the Similarity in effect at search time.
148 *
149 * The default computation in
150 * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity}
151 * is:
152 * <br> <br>
153 * <table cellpadding="1" cellspacing="0" border="0" align="center">
154 * <tr>
155 * <td valign="middle" align="right" rowspan="1">
156 * queryNorm(q) =
157 * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}
158 * =
159 * </td>
160 * <td valign="middle" align="center" rowspan="1">
161 * <table>
162 * <tr><td align="center"><big>1</big></td></tr>
163 * <tr><td align="center"><big>
164 * ––––––––––––––
165 * </big></td></tr>
166 * <tr><td align="center">sumOfSquaredWeights<sup><big>½</big></sup></td></tr>
167 * </table>
168 * </td>
169 * </tr>
170 * </table>
171 * <br> <br>
172 *
173 * The sum of squared weights (of the query terms) is
174 * computed by the query {@link org.apache.lucene.search.Weight} object.
175 * For example, a {@link org.apache.lucene.search.BooleanQuery boolean query}
176 * computes this value as:
177 *
178 * <br> <br>
179 *         <table cellpadding="1" cellspacing="0" border="0" align="center">
180 * <tr>
181 * <td valign="middle" align="right" rowspan="1">
182 * {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} =
183 * {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
184 * ·
185 * </td>
186 * <td valign="bottom" align="center" rowspan="1">
187 * <big><big><big>∑</big></big></big>
188 * </td>
189 * <td valign="middle" align="right" rowspan="1">
190 * <big><big>(</big></big>
191 * <A HREF="#formula_idf">idf(t)</A> ·
192 * <A HREF="#formula_termBoost">t.getBoost()</A>
193 * <big><big>) <sup>2</sup> </big></big>
194 * </td>
195 * </tr>
196 *      <tr valign="top">
197 * <td></td>
198 * <td align="center"><small>t in q</small></td>
199 * <td></td>
200 * </tr>
201 * </table>
202 * <br> <br>
203 *
204 * </li>
205 *
206 * <li>
207 * <A NAME="formula_termBoost"></A>
208 * <b>t.getBoost()</b>
209 * is a search time boost of term <i>t</i> in the query <i>q</i> as
210 * specified in the query text
211 * (see <A HREF="../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
212 * or as set by application calls to
213 * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
214 *    Notice that there is no direct API for accessing the boost of a single term in a multi-term query;
215 *    rather, multiple terms are represented in a query as multiple
216 *    {@link org.apache.lucene.search.TermQuery TermQuery} objects,
217 *    so the boost of a term in the query is accessible by calling the sub-query's
218 *    {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
219 * <br> <br>
220 * </li>
221 *
222 * <li>
223 * <A NAME="formula_norm"></A>
224 * <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
225 *
226 * <ul>
227 * <li><b>Document boost</b> - set by calling
228 * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
229 * before adding the document to the index.
230 * </li>
231 * <li><b>Field boost</b> - set by calling
232 * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
233 * before adding the field to a document.
234 * </li>
235 * <li>{@link #lengthNorm(String, int) <b>lengthNorm</b>(field)} - computed
236 * when the document is added to the index in accordance with the number of tokens
237 * of this field in the document, so that shorter fields contribute more to the score.
238 *     LengthNorm is computed by the Similarity class in effect at indexing time.
239 * </li>
240 * </ul>
241 *
242 * <p>
243 * When a document is added to the index, all the above factors are multiplied.
244 * If the document has multiple fields with the same name, all their boosts are multiplied together:
245 *
246 * <br> <br>
247 *       <table cellpadding="1" cellspacing="0" border="0" align="center">
248 * <tr>
249 * <td valign="middle" align="right" rowspan="1">
250 * norm(t,d) =
251 * {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
252 * ·
253 * {@link #lengthNorm(String, int) lengthNorm(field)}
254 * ·
255 * </td>
256 * <td valign="bottom" align="center" rowspan="1">
257 * <big><big><big>∏</big></big></big>
258 * </td>
259 * <td valign="middle" align="right" rowspan="1">
260 * {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
261 * </td>
262 * </tr>
263 *      <tr valign="top">
264 * <td></td>
265 * <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
266 * <td></td>
267 * </tr>
268 * </table>
269 * <br> <br>
270 * However, the resulting <i>norm</i> value is {@link #encodeNorm(float) encoded} as a single byte
271 * before being stored.
272 * At search time, the norm byte value is read from the index
273 * {@link org.apache.lucene.store.Directory directory} and
274 * {@link #decodeNorm(byte) decoded} back to a float <i>norm</i> value.
275 * This encoding/decoding, while reducing index size, comes at the cost of some
276 * precision loss: it is not guaranteed that decode(encode(x)) = x.
277 * For instance, decode(encode(0.89)) = 0.75.
278 * Also notice that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
279 * using a different {@link Similarity} for search.
280 * <br> <br>
281 * </li>
282 * </ol>
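    * <p>For illustration only, a custom <code>Similarity</code> is normally put in
    * effect both at indexing and at search time, since the <i>norm(t,d)</i> part of the
    * score is fixed when documents are indexed (<code>MyCustomSimilarity</code>,
    * <code>writer</code> and <code>searcher</code> below are hypothetical application
    * objects, shown only as a sketch):
    * <pre>
    *   Similarity sim = new MyCustomSimilarity();
    *   writer.setSimilarity(sim);     // IndexWriter used to build the index
    *   searcher.setSimilarity(sim);   // Searcher used to run queries
    * </pre>
    *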
283 *
284 * @see #setDefault(Similarity)
285 * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
286 * @see Searcher#setSimilarity(Similarity)
287 */
288 public abstract class Similarity implements Serializable {
289 /** The Similarity implementation used by default. */
290 private static Similarity defaultImpl = new DefaultSimilarity();
291
292 /** Set the default Similarity implementation used by indexing and search
293 * code.
294 *
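    * <p>A minimal usage sketch (<code>MyCustomSimilarity</code> is a hypothetical
    * application-defined subclass):
    * <pre>
    *   Similarity.setDefault(new MyCustomSimilarity());
    * </pre>
    *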
295 * @see Searcher#setSimilarity(Similarity)
296 * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
297 */
298 public static void setDefault(Similarity similarity) {
299 Similarity.defaultImpl = similarity;
300 }
301
302 /** Return the default Similarity implementation used by indexing and search
303 * code.
304 *
305 * <p>This is initially an instance of {@link DefaultSimilarity}.
306 *
307 * @see Searcher#setSimilarity(Similarity)
308 * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
309 */
310 public static Similarity getDefault() {
311 return Similarity.defaultImpl;
312 }
313
314 /** Cache of decoded bytes. */
315 private static final float[] NORM_TABLE = new float[256];
316
317 static {
318 for (int i = 0; i < 256; i++)
319 NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
320 }
321
322 /** Decodes a normalization factor stored in an index.
323 * @see #encodeNorm(float)
324 */
325 public static float decodeNorm(byte b) {
326 return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
327 }
328
329 /** Returns a table for decoding normalization bytes.
330 * @see #encodeNorm(float)
331 */
332 public static float[] getNormDecoder() {
333 return NORM_TABLE;
334 }
335
336 /** Computes the normalization value for a field given the total number of
337 * terms contained in a field. These values, together with field boosts, are
338 * stored in an index and multiplied into scores for hits on each field by the
339 * search code.
340 *
341 * <p>Matches in longer fields are less precise, so implementations of this
342 * method usually return smaller values when <code>numTokens</code> is large,
343 * and larger values when <code>numTokens</code> is small.
344 *
345 * <p>Note that these values are computed during
346 * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
347 * and then stored using
348 * {@link #encodeNorm(float)}.
349 * Thus they have limited precision, and documents
350 * must be re-indexed if this method is altered.
351 *
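    * <p>For reference only, a sketch of the default computation in
    * {@link org.apache.lucene.search.DefaultSimilarity#lengthNorm(String, int) DefaultSimilarity}
    * (an approximation, not a normative listing of that class):
    * <pre>
    *   return (float)(1.0 / Math.sqrt(numTokens));
    * </pre>
    *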
352 * @param fieldName the name of the field
353 * @param numTokens the total number of tokens contained in fields named
354 * <i>fieldName</i> of <i>doc</i>.
355 * @return a normalization factor for hits on this field of this document
356 *
357 * @see org.apache.lucene.document.Field#setBoost(float)
358 */
359 public abstract float lengthNorm(String fieldName, int numTokens);
360
361 /** Computes the normalization value for a query given the sum of the squared
362 * weights of each of the query terms.  This value is then multiplied into the
363 * weight of each query term.
364 *
365 * <p>This does not affect ranking, but rather just attempts to make scores
366 * from different queries comparable.
367 *
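    * <p>For reference only, a sketch of the default computation in
    * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity},
    * matching the formula in the class documentation (an approximation, not a
    * normative listing of that class):
    * <pre>
    *   return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
    * </pre>
    *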
368 * @param sumOfSquaredWeights the sum of the squares of query term weights
369 * @return a normalization factor for query weights
370 */
371 public abstract float queryNorm(float sumOfSquaredWeights);
372
373 /** Encodes a normalization factor for storage in an index.
374 *
375 * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
376 * the zero-exponent point at 15, thus
377 * representing values from around 7x10^9 to 2x10^-9 with about one
378 * significant decimal digit of accuracy. Zero is also represented.
379 * Negative numbers are rounded up to zero. Values too large to represent
380 * are rounded down to the largest representable value. Positive values too
381 * small to represent are rounded up to the smallest positive representable
382 * value.
383 *
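    * <p>The precision loss described in the class documentation can be observed
    * directly; a small sketch (the decoded value is approximate):
    * <pre>
    *   byte b = Similarity.encodeNorm(0.89f);
    *   float back = Similarity.decodeNorm(b);   // roughly 0.75f rather than 0.89f
    * </pre>
    *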
384 * @see org.apache.lucene.document.Field#setBoost(float)
385 * @see org.apache.lucene.util.SmallFloat
386 */
387 public static byte encodeNorm(float f) {
388 return SmallFloat.floatToByte315(f);
389 }
390
391
392 /** Computes a score factor based on a term or phrase's frequency in a
393 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
394 * factor for each term in the query and these products are then summed to
395 * form the initial score for a document.
396 *
397 * <p>Terms and phrases repeated in a document indicate the topic of the
398 * document, so implementations of this method usually return larger values
399 * when <code>freq</code> is large, and smaller values when <code>freq</code>
400 * is small.
401 *
402 * <p>The default implementation calls {@link #tf(float)}.
403 *
404 * @param freq the frequency of a term within a document
405 * @return a score factor based on a term's within-document frequency
406 */
407 public float tf(int freq) {
408 return tf((float)freq);
409 }
410
411 /** Computes the amount of a sloppy phrase match, based on an edit distance.
412 * This value is summed for each sloppy phrase match in a document to form
413 * the frequency that is passed to {@link #tf(float)}.
414 *
415 * <p>A phrase match with a small edit distance to a document passage more
416 * closely matches the document, so implementations of this method usually
417 * return larger values when the edit distance is small and smaller values
418 * when it is large.
419 *
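    * <p>For reference only, {@link DefaultSimilarity} implements this roughly as the
    * sketch below, so an exact phrase match (distance 0) counts as a full occurrence
    * and sloppier matches count for less:
    * <pre>
    *   return 1.0f / (distance + 1);
    * </pre>
    *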
420 * @see PhraseQuery#setSlop(int)
421 * @param distance the edit distance of this sloppy phrase match
422 * @return the frequency increment for this match
423 */
424 public abstract float sloppyFreq(int distance);
425
426 /** Computes a score factor based on a term or phrase's frequency in a
427 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
428 * factor for each term in the query and these products are then summed to
429 * form the initial score for a document.
430 *
431 * <p>Terms and phrases repeated in a document indicate the topic of the
432 * document, so implementations of this method usually return larger values
433 * when <code>freq</code> is large, and smaller values when <code>freq</code>
434 * is small.
435 *
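    * <p>For reference only, a sketch of the default computation in
    * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity},
    * i.e. the square root mentioned in the class documentation (an approximation,
    * not a normative listing of that class):
    * <pre>
    *   return (float)Math.sqrt(freq);
    * </pre>
    *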
436 * @param freq the frequency of a term within a document
437 * @return a score factor based on a term's within-document frequency
438 */
439 public abstract float tf(float freq);
440
441 /** Computes a score factor for a simple term.
442 *
443 * <p>The default implementation is:<pre>
444 * return idf(searcher.docFreq(term), searcher.maxDoc());
445 * </pre>
446 *
447 * Note that {@link Searcher#maxDoc()} is used instead of
448 * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is proportional to
449 * {@link Searcher#docFreq(Term)}, i.e., when one is inaccurate,
450 * so is the other, and in the same direction.
451 *
452 * @param term the term in question
453 * @param searcher the document collection being searched
454 * @return a score factor for the term
455 */
456 public float idf(Term term, Searcher searcher) throws IOException {
457 return idf(searcher.docFreq(term), searcher.maxDoc());
458 }
459
460 /** Computes a score factor for a phrase.
461 *
462 * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
463 * for each term in the phrase.
464 *
465 * @param terms the terms in the phrase
466 * @param searcher the document collection being searched
467 * @return a score factor for the phrase
468 */
469 public float idf(Collection terms, Searcher searcher) throws IOException {
470 float idf = 0.0f;
471 Iterator i = terms.iterator();
472 while (i.hasNext()) {
473 idf += idf((Term)i.next(), searcher);
474 }
475 return idf;
476 }
477
478 /** Computes a score factor based on a term's document frequency (the number
479 * of documents which contain the term). This value is multiplied by the
480 * {@link #tf(int)} factor for each term in the query and these products are
481 * then summed to form the initial score for a document.
482 *
483 * <p>Terms that occur in fewer documents are better indicators of topic, so
484 * implementations of this method usually return larger values for rare terms,
485 * and smaller values for common terms.
486 *
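    * <p>For reference only, a sketch matching the default idf formula shown in the
    * class documentation, as in
    * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity}
    * (an approximation, not a normative listing of that class):
    * <pre>
    *   return (float)(Math.log(numDocs / (double)(docFreq + 1)) + 1.0);
    * </pre>
    *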
487 * @param docFreq the number of documents which contain the term
488 * @param numDocs the total number of documents in the collection
489 * @return a score factor based on the term's document frequency
490 */
491 public abstract float idf(int docFreq, int numDocs);
492
493 /** Computes a score factor based on the fraction of all query terms that a
494 * document contains. This value is multiplied into scores.
495 *
496 * <p>The presence of a large portion of the query terms indicates a better
497 * match with the query, so implementations of this method usually return
498 * larger values when the ratio between these parameters is large and smaller
499 * values when the ratio between them is small.
500 *
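    * <p>For reference only, {@link DefaultSimilarity} computes this roughly as the
    * fraction of query terms found in the document:
    * <pre>
    *   return overlap / (float)maxOverlap;
    * </pre>
    *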
501 * @param overlap the number of query terms matched in the document
502 * @param maxOverlap the total number of terms in the query
503 * @return a score factor based on term overlap with the query
504 */
505 public abstract float coord(int overlap, int maxOverlap);
506
507
508 /**
509 * Calculate a scoring factor based on the data in the payload. Overriding implementations
510 * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
511 * what is in the byte array.
512 * <p>
513 * The default implementation returns 1.
514 *
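    * <p>As an illustration only, an overriding implementation could adopt the
    * (hypothetical, application-defined) convention that the first payload byte holds
    * a boost encoded with the same scheme as field norms:
    * <pre>
    *   public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
    *     if (payload == null || length == 0)
    *       return 1;                                // neutral, same as the default
    *     return Similarity.decodeNorm(payload[offset]);
    *   }
    * </pre>
    *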
515 * @param fieldName The fieldName of the term this payload belongs to
516 * @param payload The payload byte array to be scored
517 * @param offset The offset into the payload array
518 * @param length The length in the array
519 * @return An implementation dependent float to be used as a scoring factor
520 */
521 public float scorePayload(String fieldName, byte [] payload, int offset, int length)
522 {
523     // Default: ignore the payload contents and return a neutral scoring factor.
524 return 1;
525 }
526 }
| Similarity.java |