| Similarity.java |
1 package org.apache.lucene.search;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.index.Term;
21 import org.apache.lucene.util.SmallFloat;
22
23 import java.io.IOException;
24 import java.io.Serializable;
25 import java.util.Collection;
26 import java.util.Iterator;
27
28 /** Expert: Scoring API.
29 * <p>Subclasses implement search scoring.
30 *
31 * <p>The score of query <code>q</code> for document <code>d</code> correlates to the
32 * cosine-distance or dot-product between document and query vectors in a
33 * <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
34 * Vector Space Model (VSM) of Information Retrieval</a>.
35 * A document whose vector is closer to the query vector in that model is scored higher.
36 *
37 * The score is computed as follows:
38 *
39 * <P>
40 * <table cellpadding="1" cellspacing="0" border="1" align="center">
41 * <tr><td>
42 * <table cellpadding="1" cellspacing="0" border="0" align="center">
43 * <tr>
44 * <td valign="middle" align="right" rowspan="1">
45 * score(q,d) =
46 * <A HREF="#formula_coord">coord(q,d)</A> ·
47 * <A HREF="#formula_queryNorm">queryNorm(q)</A> ·
48 * </td>
49 * <td valign="bottom" align="center" rowspan="1">
50 * <big><big><big>∑</big></big></big>
51 * </td>
52 * <td valign="middle" align="right" rowspan="1">
53 * <big><big>(</big></big>
54 * <A HREF="#formula_tf">tf(t in d)</A> ·
55 * <A HREF="#formula_idf">idf(t)</A><sup>2</sup> ·
56 * <A HREF="#formula_termBoost">t.getBoost()</A> ·
57 * <A HREF="#formula_norm">norm(t,d)</A>
58 * <big><big>)</big></big>
59 * </td>
60 * </tr>
61 *      <tr valign="top">
62 * <td></td>
63 * <td align="center"><small>t in q</small></td>
64 * <td></td>
65 * </tr>
66 * </table>
67 * </td></tr>
68 * </table>
69 *
70 * <p> where
71 * <ol>
72 * <li>
73 * <A NAME="formula_tf"></A>
74 * <b>tf(t in d)</b>
75 * correlates to the term's <i>frequency</i>,
76 * defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
77 * Documents that have more occurrences of a given term receive a higher score.
78 * The default computation for <i>tf(t in d)</i> in
79 * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
80 *
81 * <br> <br>
82 * <table cellpadding="2" cellspacing="2" border="0" align="center">
83 * <tr>
84 * <td valign="middle" align="right" rowspan="1">
85 * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} =
86 * </td>
87 * <td valign="top" align="center" rowspan="1">
88 * frequency<sup><big>½</big></sup>
89 * </td>
90 * </tr>
91 * </table>
92 * <br> <br>
93 * </li>
94 *
95 * <li>
96 * <A NAME="formula_idf"></A>
97 * <b>idf(t)</b> stands for Inverse Document Frequency. This value
98 * correlates to the inverse of <i>docFreq</i>
99 * (the number of documents in which the term <i>t</i> appears).
100 * This means rarer terms contribute more to the total score.
101 * The default computation for <i>idf(t)</i> in
102 * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
103 *
104 * <br> <br>
105 * <table cellpadding="2" cellspacing="2" border="0" align="center">
106 * <tr>
107 * <td valign="middle" align="right">
108 * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} =
109 * </td>
110 * <td valign="middle" align="center">
111 * 1 + log <big>(</big>
112 * </td>
113 * <td valign="middle" align="center">
114 * <table>
115 * <tr><td align="center"><small>numDocs</small></td></tr>
116 * <tr><td align="center">–––––––––</td></tr>
117 * <tr><td align="center"><small>docFreq+1</small></td></tr>
118 * </table>
119 * </td>
120 * <td valign="middle" align="center">
121 * <big>)</big>
122 * </td>
123 * </tr>
124 * </table>
125 * <br> <br>
126 * </li>
127 *
128 * <li>
129 * <A NAME="formula_coord"></A>
130 * <b>coord(q,d)</b>
131 * is a score factor based on how many of the query terms are found in the specified document.
132 * Typically, a document that contains more of the query's terms will receive a higher score
133 * than another document with fewer query terms.
134 * This is a search time factor computed in
135 * {@link #coord(int, int) coord(q,d)}
136 * by the Similarity in effect at search time.
137 * <br> <br>
138 * </li>
139 *
140 * <li><b>
141 * <A NAME="formula_queryNorm"></A>
142 * queryNorm(q)
143 * </b>
144 * is a normalizing factor used to make scores between queries comparable.
145 * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
146 * but rather just attempts to make scores from different queries (or even different indexes) comparable.
147 * This is a search time factor computed by the Similarity in effect at search time.
148 *
149 * The default computation in
150 * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity}
151 * is:
152 * <br> <br>
153 * <table cellpadding="1" cellspacing="0" border="0" align="center">
154 * <tr>
155 * <td valign="middle" align="right" rowspan="1">
156 * queryNorm(q) =
157 * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}
158 * =
159 * </td>
160 * <td valign="middle" align="center" rowspan="1">
161 * <table>
162 * <tr><td align="center"><big>1</big></td></tr>
163 * <tr><td align="center"><big>
164 * ––––––––––––––
165 * </big></td></tr>
166 * <tr><td align="center">sumOfSquaredWeights<sup><big>½</big></sup></td></tr>
167 * </table>
168 * </td>
169 * </tr>
170 * </table>
171 * <br> <br>
172 *
173 * The sum of squared weights (of the query terms) is
174 * computed by the query {@link org.apache.lucene.search.Weight} object.
175 * For example, a {@link org.apache.lucene.search.BooleanQuery boolean query}
176 * computes this value as:
177 *
178 * <br> <br>
179 *         <table cellpadding="1" cellspacing="0" border="0" align="center">
180 * <tr>
181 * <td valign="middle" align="right" rowspan="1">
182 * {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} =
183 * {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
184 * ·
185 * </td>
186 * <td valign="bottom" align="center" rowspan="1">
187 * <big><big><big>∑</big></big></big>
188 * </td>
189 * <td valign="middle" align="right" rowspan="1">
190 * <big><big>(</big></big>
191 * <A HREF="#formula_idf">idf(t)</A> ·
192 * <A HREF="#formula_termBoost">t.getBoost()</A>
193 * <big><big>) <sup>2</sup> </big></big>
194 * </td>
195 * </tr>
196 *      <tr valign="top">
197 * <td></td>
198 * <td align="center"><small>t in q</small></td>
199 * <td></td>
200 * </tr>
201 * </table>
202 * <br> <br>
203 *
204 * </li>
205 *
206 * <li>
207 * <A NAME="formula_termBoost"></A>
208 * <b>t.getBoost()</b>
209 * is a search time boost of term <i>t</i> in the query <i>q</i> as
210 * specified in the query text
211 * (see <A HREF="../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
212 * or as set by application calls to
213 * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
214 *    Notice that there is no direct API for accessing the boost of a single term in a multi-term query;
215 *    rather, multiple terms are represented in a query as multiple
216 *    {@link org.apache.lucene.search.TermQuery TermQuery} objects,
217 *    so the boost of a term in the query is accessible by calling the sub-query's
218 *    {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
219 * <br> <br>
220 * </li>
221 *
222 * <li>
223 * <A NAME="formula_norm"></A>
224 * <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
225 *
226 * <ul>
227 * <li><b>Document boost</b> - set by calling
228 * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
229 * before adding the document to the index.
230 * </li>
231 * <li><b>Field boost</b> - set by calling
232 * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
233 * before adding the field to a document.
234 * </li>
235 * <li>{@link #lengthNorm(String, int) <b>lengthNorm</b>(field)} - computed
236 * when the document is added to the index in accordance with the number of tokens
237 * of this field in the document, so that shorter fields contribute more to the score.
238 *     LengthNorm is computed by the Similarity class in effect at indexing time.
239 * </li>
240 * </ul>
241 *
242 * <p>
243 * When a document is added to the index, all the above factors are multiplied.
244 * If the document has multiple fields with the same name, all their boosts are multiplied together:
245 *
246 * <br> <br>
247 *       <table cellpadding="1" cellspacing="0" border="0" align="center">
248 * <tr>
249 * <td valign="middle" align="right" rowspan="1">
250 * norm(t,d) =
251 * {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
252 * ·
253 * {@link #lengthNorm(String, int) lengthNorm(field)}
254 * ·
255 * </td>
256 * <td valign="bottom" align="center" rowspan="1">
257 * <big><big><big>∏</big></big></big>
258 * </td>
259 * <td valign="middle" align="right" rowspan="1">
260 * {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
261 * </td>
262 * </tr>
263 *      <tr valign="top">
264 * <td></td>
265 * <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
266 * <td></td>
267 * </tr>
268 * </table>
269 * <br> <br>
270 * However, the resulting <i>norm</i> value is {@link #encodeNorm(float) encoded} as a single byte
271 * before being stored.
272 * At search time, the norm byte value is read from the index
273 * {@link org.apache.lucene.store.Directory directory} and
274 * {@link #decodeNorm(byte) decoded} back to a float <i>norm</i> value.
275 * This encoding/decoding, while reducing index size, comes at the cost of some
276 * precision loss: it is not guaranteed that decode(encode(x)) = x.
277 * For instance, decode(encode(0.89)) = 0.75.
278 * Also notice that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
279 * using a different {@link Similarity} for search.
280 * <br> <br>
281 * </li>
282 * </ol>
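    * <p>For illustration only, a custom <code>Similarity</code> is normally put in
    * effect both at indexing and at search time, since the <i>norm(t,d)</i> part of the
    * score is fixed when documents are indexed (<code>MyCustomSimilarity</code>,
    * <code>writer</code> and <code>searcher</code> below are hypothetical application
    * objects, shown only as a sketch):
    * <pre>
    *   Similarity sim = new MyCustomSimilarity();
    *   writer.setSimilarity(sim);     // IndexWriter used to build the index
    *   searcher.setSimilarity(sim);   // Searcher used to run queries
    * </pre>
    *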
283 *
284 * @see #setDefault(Similarity)
285 * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
286 * @see Searcher#setSimilarity(Similarity)
287 */
288 public abstract class Similarity implements Serializable {
289 /** The Similarity implementation used by default. */
290 private static Similarity defaultImpl = new DefaultSimilarity();
291
292 /** Set the default Similarity implementation used by indexing and search
293 * code.
294 *
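    * <p>A minimal usage sketch (<code>MyCustomSimilarity</code> is a hypothetical
    * application-defined subclass):
    * <pre>
    *   Similarity.setDefault(new MyCustomSimilarity());
    * </pre>
    *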
295 * @see Searcher#setSimilarity(Similarity)
296 * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
297 */
298 public static void setDefault(Similarity similarity) {
299 Similarity.defaultImpl = similarity;
300 }
301
302 /** Return the default Similarity implementation used by indexing and search
303 * code.
304 *
305 * <p>This is initially an instance of {@link DefaultSimilarity}.
306 *
307 * @see Searcher#setSimilarity(Similarity)
308 * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
309 */
310 public static Similarity getDefault() {
311 return Similarity.defaultImpl;
312 }
313
314 /** Cache of decoded bytes. */
315 private static final float[] NORM_TABLE = new float[256];
316
317 static {
318 for (int i = 0; i < 256; i++)
319 NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
320 }
321
322 /** Decodes a normalization factor stored in an index.
323 * @see #encodeNorm(float)
324 */
325 public static float decodeNorm(byte b) {
326 return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
327 }
328
329 /** Returns a table for decoding normalization bytes.
330 * @see #encodeNorm(float)
331 */
332 public static float[] getNormDecoder() {
333 return NORM_TABLE;
334 }
335
336 /** Computes the normalization value for a field given the total number of
337 * terms contained in a field. These values, together with field boosts, are
338 * stored in an index and multiplied into scores for hits on each field by the
339 * search code.
340 *
341 * <p>Matches in longer fields are less precise, so implementations of this
342 * method usually return smaller values when <code>numTokens</code> is large,
343 * and larger values when <code>numTokens</code> is small.
344 *
345 * <p>Note that these values are computed during
346 * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
347 * and then stored using
348 * {@link #encodeNorm(float)}.
349 * Thus they have limited precision, and documents
350 * must be re-indexed if this method is altered.
351 *
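    * <p>For reference only, a sketch of the default computation in
    * {@link org.apache.lucene.search.DefaultSimilarity#lengthNorm(String, int) DefaultSimilarity}
    * (an approximation, not a normative listing of that class):
    * <pre>
    *   return (float)(1.0 / Math.sqrt(numTokens));
    * </pre>
    *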
352 * @param fieldName the name of the field
353 * @param numTokens the total number of tokens contained in fields named
354 * <i>fieldName</i> of <i>doc</i>.
355 * @return a normalization factor for hits on this field of this document
356 *
357 * @see org.apache.lucene.document.Field#setBoost(float)
358 */
359 public abstract float lengthNorm(String fieldName, int numTokens);
360
361 /** Computes the normalization value for a query given the sum of the squared
362 * weights of each of the query terms.  This value is then multiplied into the
363 * weight of each query term.
364 *
365 * <p>This does not affect ranking, but rather just attempts to make scores
366 * from different queries comparable.
367 *
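    * <p>For reference only, a sketch of the default computation in
    * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity},
    * matching the formula in the class documentation (an approximation, not a
    * normative listing of that class):
    * <pre>
    *   return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
    * </pre>
    *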
368 * @param sumOfSquaredWeights the sum of the squares of query term weights
369 * @return a normalization factor for query weights
370 */
371 public abstract float queryNorm(float sumOfSquaredWeights);
372
373 /** Encodes a normalization factor for storage in an index.
374 *
375 * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
376 * the zero-exponent point at 15, thus
377 * representing values from around 7x10^9 to 2x10^-9 with about one
378 * significant decimal digit of accuracy. Zero is also represented.
379 * Negative numbers are rounded up to zero. Values too large to represent
380 * are rounded down to the largest representable value. Positive values too
381 * small to represent are rounded up to the smallest positive representable
382 * value.
383 *
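    * <p>The precision loss described in the class documentation can be observed
    * directly; a small sketch (the decoded value is approximate):
    * <pre>
    *   byte b = Similarity.encodeNorm(0.89f);
    *   float back = Similarity.decodeNorm(b);   // roughly 0.75f rather than 0.89f
    * </pre>
    *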
384 * @see org.apache.lucene.document.Field#setBoost(float)
385 * @see org.apache.lucene.util.SmallFloat
386 */
387 public static byte encodeNorm(float f) {
388 return SmallFloat.floatToByte315(f);
389 }
390
391
392 /** Computes a score factor based on a term or phrase's frequency in a
393 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
394 * factor for each term in the query and these products are then summed to
395 * form the initial score for a document.
396 *
397 * <p>Terms and phrases repeated in a document indicate the topic of the
398 * document, so implementations of this method usually return larger values
399 * when <code>freq</code> is large, and smaller values when <code>freq</code>
400 * is small.
401 *
402 * <p>The default implementation calls {@link #tf(float)}.
403 *
404 * @param freq the frequency of a term within a document
405 * @return a score factor based on a term's within-document frequency
406 */
407 public float tf(int freq) {
408 return tf((float)freq);
409 }
410
411 /** Computes the amount of a sloppy phrase match, based on an edit distance.
412 * This value is summed for each sloppy phrase match in a document to form
413 * the frequency that is passed to {@link #tf(float)}.
414 *
415 * <p>A phrase match with a small edit distance to a document passage more
416 * closely matches the document, so implementations of this method usually
417 * return larger values when the edit distance is small and smaller values
418 * when it is large.
419 *
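    * <p>For reference only, {@link DefaultSimilarity} implements this roughly as the
    * sketch below, so an exact phrase match (distance 0) counts as a full occurrence
    * and sloppier matches count for less:
    * <pre>
    *   return 1.0f / (distance + 1);
    * </pre>
    *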
420 * @see PhraseQuery#setSlop(int)
421 * @param distance the edit distance of this sloppy phrase match
422 * @return the frequency increment for this match
423 */
424 public abstract float sloppyFreq(int distance);
425
426 /** Computes a score factor based on a term or phrase's frequency in a
427 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
428 * factor for each term in the query and these products are then summed to
429 * form the initial score for a document.
430 *
431 * <p>Terms and phrases repeated in a document indicate the topic of the
432 * document, so implementations of this method usually return larger values
433 * when <code>freq</code> is large, and smaller values when <code>freq</code>
434 * is small.
435 *
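    * <p>For reference only, a sketch of the default computation in
    * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity},
    * i.e. the square root mentioned in the class documentation (an approximation,
    * not a normative listing of that class):
    * <pre>
    *   return (float)Math.sqrt(freq);
    * </pre>
    *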
436 * @param freq the frequency of a term within a document
437 * @return a score factor based on a term's within-document frequency
438 */
439 public abstract float tf(float freq);
440
441 /** Computes a score factor for a simple term.
442 *
443 * <p>The default implementation is:<pre>
444 * return idf(searcher.docFreq(term), searcher.maxDoc());
445 * </pre>
446 *
447 * Note that {@link Searcher#maxDoc()} is used instead of
448 * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is proportional to
449 * {@link Searcher#docFreq(Term)}, i.e., when one is inaccurate,
450 * so is the other, and in the same direction.
451 *
452 * @param term the term in question
453 * @param searcher the document collection being searched
454 * @return a score factor for the term
455 */
456 public float idf(Term term, Searcher searcher) throws IOException {
457 return idf(searcher.docFreq(term), searcher.maxDoc());
458 }
459
460 /** Computes a score factor for a phrase.
461 *
462 * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
463 * for each term in the phrase.
464 *
465 * @param terms the terms in the phrase
466 * @param searcher the document collection being searched
467 * @return a score factor for the phrase
468 */
469 public float idf(Collection terms, Searcher searcher) throws IOException {
470 float idf = 0.0f;
471 Iterator i = terms.iterator();
472 while (i.hasNext()) {
473 idf += idf((Term)i.next(), searcher);
474 }
475 return idf;
476 }
477
478 /** Computes a score factor based on a term's document frequency (the number
479 * of documents which contain the term). This value is multiplied by the
480 * {@link #tf(int)} factor for each term in the query and these products are
481 * then summed to form the initial score for a document.
482 *
483 * <p>Terms that occur in fewer documents are better indicators of topic, so
484 * implementations of this method usually return larger values for rare terms,
485 * and smaller values for common terms.
486 *
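    * <p>For reference only, a sketch matching the default idf formula shown in the
    * class documentation, as in
    * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity}
    * (an approximation, not a normative listing of that class):
    * <pre>
    *   return (float)(Math.log(numDocs / (double)(docFreq + 1)) + 1.0);
    * </pre>
    *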
487 * @param docFreq the number of documents which contain the term
488 * @param numDocs the total number of documents in the collection
489 * @return a score factor based on the term's document frequency
490 */
491 public abstract float idf(int docFreq, int numDocs);
492
493 /** Computes a score factor based on the fraction of all query terms that a
494 * document contains. This value is multiplied into scores.
495 *
496 * <p>The presence of a large portion of the query terms indicates a better
497 * match with the query, so implementations of this method usually return
498 * larger values when the ratio between these parameters is large and smaller
499 * values when the ratio between them is small.
500 *
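    * <p>For reference only, {@link DefaultSimilarity} computes this roughly as the
    * fraction of query terms found in the document:
    * <pre>
    *   return overlap / (float)maxOverlap;
    * </pre>
    *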
501 * @param overlap the number of query terms matched in the document
502 * @param maxOverlap the total number of terms in the query
503 * @return a score factor based on term overlap with the query
504 */
505 public abstract float coord(int overlap, int maxOverlap);
506
507
508 /**
509 * Calculate a scoring factor based on the data in the payload. Overriding implementations
510 * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
511 * what is in the byte array.
512 * <p>
513 * The default implementation returns 1.
514 *
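    * <p>As an illustration only, an overriding implementation could adopt the
    * (hypothetical, application-defined) convention that the first payload byte holds
    * a boost encoded with the same scheme as field norms:
    * <pre>
    *   public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
    *     if (payload == null || length == 0)
    *       return 1;                                // neutral, same as the default
    *     return Similarity.decodeNorm(payload[offset]);
    *   }
    * </pre>
    *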
515 * @param fieldName The fieldName of the term this payload belongs to
516 * @param payload The payload byte array to be scored
517 * @param offset The offset into the payload array
518 * @param length The length in the array
519 * @return An implementation dependent float to be used as a scoring factor
520 */
521 public float scorePayload(String fieldName, byte [] payload, int offset, int length)
522 {
523     // Default: ignore the payload contents and return a neutral scoring factor.
524 return 1;
525 }
526 }
| Similarity.java |