| IndexReader.java |
1 package org.apache.lucene.index;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.document.Document;
21 import org.apache.lucene.document.FieldSelector;
22 import org.apache.lucene.search.Similarity;
23 import org.apache.lucene.store.*;
24
25 import java.io.File;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.util.Arrays;
29 import java.util.Collection;
30
31 /** IndexReader is an abstract class, providing an interface for accessing an
32 index. Search of an index is done entirely through this abstract interface,
33 so that any subclass which implements it is searchable.
34
35 <p> Concrete subclasses of IndexReader are usually constructed with a call to
36 one of the static <code>open()</code> methods, e.g. {@link #open(String)}.
37
38 <p> For efficiency, in this API documents are often referred to via
39 <i>document numbers</i>, non-negative integers which each name a unique
40 document in the index. These document numbers are ephemeral--they may change
41 as documents are added to and deleted from an index. Clients should thus not
42 rely on a given document having the same number between sessions.
43
44 <p> An IndexReader can be opened on a directory for which an IndexWriter is
45 opened already, but it cannot be used to delete documents from the index then.
46
47 <p>
48 <b>NOTE</b>: for backwards API compatibility, several methods are not listed
49 as abstract, but have no useful implementations in this base class and
50 instead always throw UnsupportedOperationException. Subclasses are
51 strongly encouraged to override these methods, but in many cases may not
52 need to.
53 </p>
54
55 <p>
56
57 <b>NOTE</b>: as of 2.4, it's possible to open a read-only
58 IndexReader using one of the static open methods that
59 accepts the boolean readOnly parameter. Such a reader has
60 better concurrency as it's not necessary to synchronize on
61 the isDeleted method. Currently the default for readOnly
62 is false, meaning if not specified you will get a
63 read/write IndexReader. But in 3.0 this default will
64 change to true, meaning you must explicitly specify false
65 if you want to make changes with the resulting IndexReader.
66 </p>
67
68 @version $Id: IndexReader.java 695510 2008-09-15 15:33:15Z otis $
69 */
70 public abstract class IndexReader {
71
72 // NOTE: in 3.0 this will change to true
73 final static boolean READ_ONLY_DEFAULT = false;
74
75 /**
76 * Constants describing field properties, for example used for
77 * {@link IndexReader#getFieldNames(FieldOption)}.
78 */
79 public static final class FieldOption {
80 private String option;
81 private FieldOption() { }
82 private FieldOption(String option) {
83 this.option = option;
84 }
85 public String toString() {
86 return this.option;
87 }
88 /** All fields */
89 public static final FieldOption ALL = new FieldOption ("ALL");
90 /** All indexed fields */
91 public static final FieldOption INDEXED = new FieldOption ("INDEXED");
92 /** All fields that store payloads */
93 public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
94 /** All fields that omit tf */
95 public static final FieldOption OMIT_TF = new FieldOption ("OMIT_TF");
96 /** All fields which are not indexed */
97 public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
98 /** All fields which are indexed with termvectors enabled */
99 public static final FieldOption INDEXED_WITH_TERMVECTOR = new FieldOption ("INDEXED_WITH_TERMVECTOR");
100 /** All fields which are indexed but don't have termvectors enabled */
101 public static final FieldOption INDEXED_NO_TERMVECTOR = new FieldOption ("INDEXED_NO_TERMVECTOR");
102 /** All fields with termvectors enabled. Please note that only standard termvector fields are returned */
103 public static final FieldOption TERMVECTOR = new FieldOption ("TERMVECTOR");
104 /** All fields with termvectors with position values enabled */
105 public static final FieldOption TERMVECTOR_WITH_POSITION = new FieldOption ("TERMVECTOR_WITH_POSITION");
106 /** All fields with termvectors with offset values enabled */
107 public static final FieldOption TERMVECTOR_WITH_OFFSET = new FieldOption ("TERMVECTOR_WITH_OFFSET");
108 /** All fields with termvectors with offset values and position values enabled */
109 public static final FieldOption TERMVECTOR_WITH_POSITION_OFFSET = new FieldOption ("TERMVECTOR_WITH_POSITION_OFFSET");
110 }
111
// NOTE(review): not read or written in the visible part of this file;
// presumably maintained by close() -- confirm.
private boolean closed;
// Set to true by setNorm(int, String, byte) before the change is applied.
protected boolean hasChanges;

// Number of outstanding references to this reader. Starts at 1 (see the
// no-arg constructor) and is adjusted by incRef()/decRef(); ensureOpen()
// treats a value <= 0 as "closed".
private volatile int refCount;
116
117 // for testing
118 synchronized int getRefCount() {
119 return refCount;
120 }
121
122 /**
123 * Expert: increments the refCount of this IndexReader
124 * instance. RefCounts are used to determine when a
125 * reader can be closed safely, i.e. as soon as there are
126 * no more references. Be sure to always call a
127 * corresponding {@link #decRef}, in a finally clause;
128 * otherwise the reader may never be closed. Note that
129 * {@link #close} simply calls decRef(), which means that
130 * the IndexReader will not really be closed until {@link
131 * #decRef} has been called for all outstanding
132 * references.
133 *
134 * @see #decRef
135 */
136 public synchronized void incRef() {
137 assert refCount > 0;
138 ensureOpen();
139 refCount++;
140 }
141
142 /**
143 * Expert: decreases the refCount of this IndexReader
144 * instance. If the refCount drops to 0, then pending
145 * changes (if any) are committed to the index and this
146 * reader is closed.
147 *
148 * @throws IOException in case an IOException occurs in commit() or doClose()
149 *
150 * @see #incRef
151 */
152 public synchronized void decRef() throws IOException {
153 assert refCount > 0;
154 ensureOpen();
155 if (refCount == 1) {
156 commit();
157 doClose();
158 }
159 refCount--;
160 }
161
/**
 * Directory supplied to the legacy IndexReader(Directory) constructor and
 * returned by the default {@link #directory()} implementation. It is left
 * null when the no-arg constructor is used.
 *
 * @deprecated will be deleted when IndexReader(Directory) is deleted
 * @see #directory()
 */
private Directory directory;
167
/**
 * Legacy Constructor for backwards compatibility.
 *
 * <p>
 * This Constructor should not be used; it exists for backwards
 * compatibility only, to support legacy subclasses that did not "own"
 * a specific directory but needed to specify something to be returned
 * by the directory() method. Future subclasses should delegate to the
 * no arg constructor and implement the directory() method as appropriate.
 *
 * @param directory Directory to be returned by the directory() method
 * @see #directory()
 * @deprecated - use IndexReader()
 */
protected IndexReader(Directory directory) {
  // the no-arg constructor sets the initial reference count
  this();
  this.directory = directory;
}
186
/**
 * Creates a reader holding a single reference, owned by the creator.
 */
protected IndexReader() {
  this.refCount = 1;
}
190
191 /**
192 * @throws AlreadyClosedException if this IndexReader is closed
193 */
194 protected final void ensureOpen() throws AlreadyClosedException {
195 if (refCount <= 0) {
196 throw new AlreadyClosedException("this IndexReader is closed");
197 }
198 }
199
200 /** Returns a read/write IndexReader reading the index in an FSDirectory in the named
201 path. <b>NOTE</b>: starting in 3.0 this will return a readOnly IndexReader.
202 * @throws CorruptIndexException if the index is corrupt
203 * @throws IOException if there is a low-level IO error
204 * @param path the path to the index directory */
205 public static IndexReader open(String path) throws CorruptIndexException, IOException {
206 return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT);
207 }
208
209 /** Returns a read/write IndexReader reading the index in an FSDirectory in the named
210 * path. <b>NOTE</b>: starting in 3.0 this will return a readOnly IndexReader.
211 * @param path the path to the index directory
212 * @throws CorruptIndexException if the index is corrupt
213 * @throws IOException if there is a low-level IO error
214 */
215 public static IndexReader open(File path) throws CorruptIndexException, IOException {
216 return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT);
217 }
218
219 /** Returns a read/write IndexReader reading the index in
220 * the given Directory. <b>NOTE</b>: starting in 3.0 this
221 * will return a readOnly IndexReader.
222 * @param directory the index directory
223 * @throws CorruptIndexException if the index is corrupt
224 * @throws IOException if there is a low-level IO error
225 */
226 public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException {
227 return open(directory, false, null, null, READ_ONLY_DEFAULT);
228 }
229
230 /** Returns a read/write or read only IndexReader reading the index in the given Directory.
231 * @param directory the index directory
232 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
233 * @throws CorruptIndexException if the index is corrupt
234 * @throws IOException if there is a low-level IO error
235 */
236 public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException {
237 return open(directory, false, null, null, readOnly);
238 }
239
240 /** Expert: returns a read/write IndexReader reading the index in the given
241 * {@link IndexCommit}. <b>NOTE</b>: starting in 3.0 this
242 * will return a readOnly IndexReader.
243 * @param commit the commit point to open
244 * @throws CorruptIndexException if the index is corrupt
245 * @throws IOException if there is a low-level IO error
246 */
247 public static IndexReader open(final IndexCommit commit) throws CorruptIndexException, IOException {
248 return open(commit.getDirectory(), false, null, commit, READ_ONLY_DEFAULT);
249 }
250
251 /** Expert: returns a read/write IndexReader reading the index in the given
252 * Directory, with a custom {@link IndexDeletionPolicy}.
253 * <b>NOTE</b>: starting in 3.0 this will return a
254 * readOnly IndexReader.
255 * @param directory the index directory
256 * @param deletionPolicy a custom deletion policy (only used
257 * if you use this reader to perform deletes or to set
258 * norms); see {@link IndexWriter} for details.
259 * @throws CorruptIndexException if the index is corrupt
260 * @throws IOException if there is a low-level IO error
261 */
262 public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException {
263 return open(directory, false, deletionPolicy, null, READ_ONLY_DEFAULT);
264 }
265
266 /** Expert: returns a read/write or read only IndexReader reading the index in the given
267 * Directory, with a custom {@link IndexDeletionPolicy}.
268 * <b>NOTE</b>: starting in 3.0 this will return a
269 * readOnly IndexReader.
270 * @param directory the index directory
271 * @param deletionPolicy a custom deletion policy (only used
272 * if you use this reader to perform deletes or to set
273 * norms); see {@link IndexWriter} for details.
274 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
275 * @throws CorruptIndexException if the index is corrupt
276 * @throws IOException if there is a low-level IO error
277 */
278 public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException {
279 return open(directory, false, deletionPolicy, null, readOnly);
280 }
281
282 /** Expert: returns a read/write IndexReader reading the index in the given
283 * Directory, using a specific commit and with a custom
284 * {@link IndexDeletionPolicy}. <b>NOTE</b>: starting in
285 * 3.0 this will return a readOnly IndexReader.
286 * @param commit the specific {@link IndexCommit} to open;
287 * see {@link IndexReader#listCommits} to list all commits
288 * in a directory
289 * @param deletionPolicy a custom deletion policy (only used
290 * if you use this reader to perform deletes or to set
291 * norms); see {@link IndexWriter} for details.
292 * @throws CorruptIndexException if the index is corrupt
293 * @throws IOException if there is a low-level IO error
294 */
295 public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException {
296 return open(commit.getDirectory(), false, deletionPolicy, commit, READ_ONLY_DEFAULT);
297 }
298
299 /** Expert: returns a read/write or read only IndexReader reading the index in the given
300 * Directory, using a specific commit and with a custom {@link IndexDeletionPolicy}.
301 * @param commit the specific {@link IndexCommit} to open;
302 * see {@link IndexReader#listCommits} to list all commits
303 * in a directory
304 * @param deletionPolicy a custom deletion policy (only used
305 * if you use this reader to perform deletes or to set
306 * norms); see {@link IndexWriter} for details.
307 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
308 * @throws CorruptIndexException if the index is corrupt
309 * @throws IOException if there is a low-level IO error
310 */
311 public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException {
312 return open(commit.getDirectory(), false, deletionPolicy, commit, readOnly);
313 }
314
315 private static IndexReader open(final Directory directory, final boolean closeDirectory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly) throws CorruptIndexException, IOException {
316 return DirectoryIndexReader.open(directory, closeDirectory, deletionPolicy, commit, readOnly);
317 }
318
319 /**
320 * Refreshes an IndexReader if the index has changed since this instance
321 * was (re)opened.
322 * <p>
323 * Opening an IndexReader is an expensive operation. This method can be used
324 * to refresh an existing IndexReader to reduce these costs. This method
325 * tries to only load segments that have changed or were created after the
326 * IndexReader was (re)opened.
327 * <p>
328 * If the index has not changed since this instance was (re)opened, then this
329 * call is a NOOP and returns this instance. Otherwise, a new instance is
330 * returned. The old instance is <b>not</b> closed and remains usable.<br>
331 * <b>Note:</b> The re-opened reader instance and the old instance might share
332 * the same resources. For this reason no index modification operations
333 * (e. g. {@link #deleteDocument(int)}, {@link #setNorm(int, String, byte)})
334 * should be performed using one of the readers until the old reader instance
335 * is closed. <b>Otherwise, the behavior of the readers is undefined.</b>
336 * <p>
337 * You can determine whether a reader was actually reopened by comparing the
338 * old instance with the instance returned by this method:
339 * <pre>
340 * IndexReader reader = ...
341 * ...
342 * IndexReader new = r.reopen();
343 * if (new != reader) {
344 * ... // reader was reopened
345 * reader.close();
346 * }
347 * reader = new;
348 * ...
349 * </pre>
350 *
351 * @throws CorruptIndexException if the index is corrupt
352 * @throws IOException if there is a low-level IO error
353 */
354 public synchronized IndexReader reopen() throws CorruptIndexException, IOException {
355 throw new UnsupportedOperationException("This reader does not support reopen().");
356 }
357
358 /**
359 * Returns the directory associated with this index. The Default
360 * implementation returns the directory specified by subclasses when
361 * delegating to the IndexReader(Directory) constructor, or throws an
362 * UnsupportedOperationException if one was not specified.
363 * @throws UnsupportedOperationException if no directory
364 */
365 public Directory directory() {
366 ensureOpen();
367 if (null != directory) {
368 return directory;
369 } else {
370 throw new UnsupportedOperationException("This reader does not support this method.");
371 }
372 }
373
374 /**
375 * Returns the time the index in the named directory was last modified.
376 * Do not use this to check whether the reader is still up-to-date, use
377 * {@link #isCurrent()} instead.
378 * @throws CorruptIndexException if the index is corrupt
379 * @throws IOException if there is a low-level IO error
380 */
381 public static long lastModified(String directory) throws CorruptIndexException, IOException {
382 return lastModified(new File(directory));
383 }
384
385 /**
386 * Returns the time the index in the named directory was last modified.
387 * Do not use this to check whether the reader is still up-to-date, use
388 * {@link #isCurrent()} instead.
389 * @throws CorruptIndexException if the index is corrupt
390 * @throws IOException if there is a low-level IO error
391 */
392 public static long lastModified(File fileDirectory) throws CorruptIndexException, IOException {
393 return ((Long) new SegmentInfos.FindSegmentsFile(fileDirectory) {
394 public Object doBody(String segmentFileName) {
395 return new Long(FSDirectory.fileModified(fileDirectory, segmentFileName));
396 }
397 }.run()).longValue();
398 }
399
400 /**
401 * Returns the time the index in the named directory was last modified.
402 * Do not use this to check whether the reader is still up-to-date, use
403 * {@link #isCurrent()} instead.
404 * @throws CorruptIndexException if the index is corrupt
405 * @throws IOException if there is a low-level IO error
406 */
407 public static long lastModified(final Directory directory2) throws CorruptIndexException, IOException {
408 return ((Long) new SegmentInfos.FindSegmentsFile(directory2) {
409 public Object doBody(String segmentFileName) throws IOException {
410 return new Long(directory2.fileModified(segmentFileName));
411 }
412 }.run()).longValue();
413 }
414
415 /**
416 * Reads version number from segments files. The version number is
417 * initialized with a timestamp and then increased by one for each change of
418 * the index.
419 *
420 * @param directory where the index resides.
421 * @return version number.
422 * @throws CorruptIndexException if the index is corrupt
423 * @throws IOException if there is a low-level IO error
424 */
425 public static long getCurrentVersion(String directory) throws CorruptIndexException, IOException {
426 return getCurrentVersion(new File(directory));
427 }
428
429 /**
430 * Reads version number from segments files. The version number is
431 * initialized with a timestamp and then increased by one for each change of
432 * the index.
433 *
434 * @param directory where the index resides.
435 * @return version number.
436 * @throws CorruptIndexException if the index is corrupt
437 * @throws IOException if there is a low-level IO error
438 */
439 public static long getCurrentVersion(File directory) throws CorruptIndexException, IOException {
440 Directory dir = FSDirectory.getDirectory(directory);
441 long version = getCurrentVersion(dir);
442 dir.close();
443 return version;
444 }
445
446 /**
447 * Reads version number from segments files. The version number is
448 * initialized with a timestamp and then increased by one for each change of
449 * the index.
450 *
451 * @param directory where the index resides.
452 * @return version number.
453 * @throws CorruptIndexException if the index is corrupt
454 * @throws IOException if there is a low-level IO error
455 */
456 public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException {
457 return SegmentInfos.readCurrentVersion(directory);
458 }
459
460 /**
461 * Version number when this IndexReader was opened. Not implemented in the IndexReader base class.
462 * @throws UnsupportedOperationException unless overridden in subclass
463 */
464 public long getVersion() {
465 throw new UnsupportedOperationException("This reader does not support this method.");
466 }
467
468 /**<p>For IndexReader implementations that use
469 * TermInfosReader to read terms, this sets the
470 * indexDivisor to subsample the number of indexed terms
471 * loaded into memory. This has the same effect as {@link
472 * IndexWriter#setTermIndexInterval} except that setting
473 * must be done at indexing time while this setting can be
474 * set per reader. When set to N, then one in every
475 * N*termIndexInterval terms in the index is loaded into
476 * memory. By setting this to a value > 1 you can reduce
477 * memory usage, at the expense of higher latency when
478 * loading a TermInfo. The default value is 1.</p>
479 *
480 * <b>NOTE:</b> you must call this before the term
481 * index is loaded. If the index is already loaded,
482 * an IllegalStateException is thrown.
483 * @throws IllegalStateException if the term index has already been loaded into memory
484 */
485 public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
486 throw new UnsupportedOperationException("This reader does not support this method.");
487 }
488
489 /** <p>For IndexReader implementations that use
490 * TermInfosReader to read terms, this returns the
491 * current indexDivisor.
492 * @see #setTermInfosIndexDivisor */
493 public int getTermInfosIndexDivisor() {
494 throw new UnsupportedOperationException("This reader does not support this method.");
495 }
496
497 /**
498 * Check whether this IndexReader is still using the
499 * current (i.e., most recently committed) version of the
500 * index. If a writer has committed any changes to the
501 * index since this reader was opened, this will return
502 * <code>false</code>, in which case you must open a new
503 * IndexReader in order to see the changes. See the
504 * description of the <a href="IndexWriter.html#autoCommit"><code>autoCommit</code></a>
505 * flag which controls when the {@link IndexWriter}
506 * actually commits changes to the index.
507 *
508 * <p>
509 * Not implemented in the IndexReader base class.
510 * </p>
511 * @throws CorruptIndexException if the index is corrupt
512 * @throws IOException if there is a low-level IO error
513 * @throws UnsupportedOperationException unless overridden in subclass
514 */
515 public boolean isCurrent() throws CorruptIndexException, IOException {
516 throw new UnsupportedOperationException("This reader does not support this method.");
517 }
518
519 /**
520 * Checks is the index is optimized (if it has a single segment and
521 * no deletions). Not implemented in the IndexReader base class.
522 * @return <code>true</code> if the index is optimized; <code>false</code> otherwise
523 * @throws UnsupportedOperationException unless overridden in subclass
524 */
525 public boolean isOptimized() {
526 throw new UnsupportedOperationException("This reader does not support this method.");
527 }
528
529 /**
530 * Return an array of term frequency vectors for the specified document.
531 * The array contains a vector for each vectorized field in the document.
532 * Each vector contains terms and frequencies for all terms in a given vectorized field.
533 * If no such fields existed, the method returns null. The term vectors that are
534 * returned my either be of type TermFreqVector or of type TermPositionsVector if
535 * positions or offsets have been stored.
536 *
537 * @param docNumber document for which term frequency vectors are returned
538 * @return array of term frequency vectors. May be null if no term vectors have been
539 * stored for the specified document.
540 * @throws IOException if index cannot be accessed
541 * @see org.apache.lucene.document.Field.TermVector
542 */
543 abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
544 throws IOException;
545
546
547 /**
548 * Return a term frequency vector for the specified document and field. The
549 * returned vector contains terms and frequencies for the terms in
550 * the specified field of this document, if the field had the storeTermVector
551 * flag set. If termvectors had been stored with positions or offsets, a
552 * TermPositionsVector is returned.
553 *
554 * @param docNumber document for which the term frequency vector is returned
555 * @param field field for which the term frequency vector is returned.
556 * @return term frequency vector May be null if field does not exist in the specified
557 * document or term vector was not stored.
558 * @throws IOException if index cannot be accessed
559 * @see org.apache.lucene.document.Field.TermVector
560 */
561 abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
562 throws IOException;
563
564 /**
565 * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
566 * the {@link TermFreqVector}.
567 * @param docNumber The number of the document to load the vector for
568 * @param field The name of the field to load
569 * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
570 * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
571 *
572 */
573 abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
574
575 /**
576 * Map all the term vectors for all fields in a Document
577 * @param docNumber The number of the document to load the vector for
578 * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
579 * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
580 */
581 abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
582
583 /**
584 * Returns <code>true</code> if an index exists at the specified directory.
585 * If the directory does not exist or if there is no index in it.
586 * <code>false</code> is returned.
587 * @param directory the directory to check for an index
588 * @return <code>true</code> if an index exists; <code>false</code> otherwise
589 */
590 public static boolean indexExists(String directory) {
591 return indexExists(new File(directory));
592 }
593
594 /**
595 * Returns <code>true</code> if an index exists at the specified directory.
596 * If the directory does not exist or if there is no index in it.
597 * @param directory the directory to check for an index
598 * @return <code>true</code> if an index exists; <code>false</code> otherwise
599 */
600
601 public static boolean indexExists(File directory) {
602 return SegmentInfos.getCurrentSegmentGeneration(directory.list()) != -1;
603 }
604
605 /**
606 * Returns <code>true</code> if an index exists at the specified directory.
607 * If the directory does not exist or if there is no index in it.
608 * @param directory the directory to check for an index
609 * @return <code>true</code> if an index exists; <code>false</code> otherwise
610 * @throws IOException if there is a problem with accessing the index
611 */
612 public static boolean indexExists(Directory directory) throws IOException {
613 return SegmentInfos.getCurrentSegmentGeneration(directory) != -1;
614 }
615
/**
 * Returns the number of documents in this index.
 * {@link #numDeletedDocs()} subtracts this value from {@link #maxDoc()}.
 */
public abstract int numDocs();
618
/**
 * Returns one greater than the largest possible document number.
 * This may be used to, e.g., determine how big to allocate an array which
 * will have an element for every document number in an index.
 */
public abstract int maxDoc();
624
625 /** Returns the number of deleted documents. */
626 public int numDeletedDocs() {
627 return maxDoc() - numDocs();
628 }
629
630 /** Returns the stored fields of the <code>n</code><sup>th</sup>
631 <code>Document</code> in this index.
632 * @throws CorruptIndexException if the index is corrupt
633 * @throws IOException if there is a low-level IO error
634 */
635 public Document document(int n) throws CorruptIndexException, IOException {
636 ensureOpen();
637 return document(n, null);
638 }
639
/**
 * Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position. The {@link org.apache.lucene.document.FieldSelector}
 * may be used to determine what {@link org.apache.lucene.document.Field}s to load and how they should be loaded.
 *
 * <b>NOTE:</b> If this Reader (more specifically, the underlying <code>FieldsReader</code>) is closed before the lazy {@link org.apache.lucene.document.Field} is
 * loaded, an exception may be thrown. If you want the value of a lazy {@link org.apache.lucene.document.Field} to be available after closing, you must
 * explicitly load it or fetch the Document again with a new loader.
 *
 * @param n Get the document at the <code>n</code><sup>th</sup> position
 * @param fieldSelector The {@link org.apache.lucene.document.FieldSelector} to use to determine what Fields should be loaded on the Document. May be null, in which case all Fields will be loaded.
 * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 *
 * @see org.apache.lucene.document.Fieldable
 * @see org.apache.lucene.document.FieldSelector
 * @see org.apache.lucene.document.SetBasedFieldSelector
 * @see org.apache.lucene.document.LoadFirstFieldSelector
 */
//When we convert to JDK 1.5 make this Set<String>
// NOTE(review): nothing in this signature is a Set; the remark above looks
// misplaced -- confirm which declaration it was meant for.
public abstract Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;
662
663
664
/**
 * Returns true if document <i>n</i> has been deleted.
 * @param n the document number to check
 */
public abstract boolean isDeleted(int n);
667
/**
 * Returns true if any documents have been deleted.
 */
public abstract boolean hasDeletions();
670
671 /** Returns true if there are norms stored for this field. */
672 public boolean hasNorms(String field) throws IOException {
673 // backward compatible implementation.
674 // SegmentReader has an efficient implementation.
675 ensureOpen();
676 return norms(field) != null;
677 }
678
/**
 * Returns the byte-encoded normalization factor for the named field of
 * every document. This is used by the search code to score documents.
 * {@link #hasNorms(String)} treats a null return value as "no norms stored".
 *
 * @param field the field name
 * @throws IOException if the norms cannot be read
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public abstract byte[] norms(String field) throws IOException;
685
/**
 * Reads the byte-encoded normalization factor for the named field of every
 * document. This is used by the search code to score documents.
 *
 * @param field the field name
 * @param bytes destination array (presumably filled starting at
 *        <code>offset</code> -- confirm against implementations)
 * @param offset position in <code>bytes</code>
 * @throws IOException if the norms cannot be read
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public abstract void norms(String field, byte[] bytes, int offset)
  throws IOException;
693
/**
 * Expert: Resets the normalization factor for the named field of the named
 * document. The norm represents the product of the field's {@link
 * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
 * int) length normalization}. Thus, to preserve the length normalization
 * values when resetting this, one should base the new value upon the old.
 *
 * @param doc the document number
 * @param field the field name
 * @param value the new byte-encoded norm
 * @see #norms(String)
 * @see Similarity#decodeNorm(byte)
 * @throws StaleReaderException if the index has changed
 *  since this reader was opened
 * @throws CorruptIndexException if the index is corrupt
 * @throws LockObtainFailedException if another writer
 *  has this index open (<code>write.lock</code> could not
 *  be obtained)
 * @throws IOException if there is a low-level IO error
 */
public synchronized void setNorm(int doc, String field, byte value)
  throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  ensureOpen();
  // the write lock is taken before anything is modified
  acquireWriteLock();
  // flag the pending change before delegating to the subclass hook
  hasChanges = true;
  doSetNorm(doc, field, value);
}
717
/**
 * Implements setNorm in subclass. Invoked by
 * {@link #setNorm(int, String, byte)} after that method has called
 * <code>ensureOpen()</code> and <code>acquireWriteLock()</code> and has set
 * <code>hasChanges</code>.
 *
 * @param doc number of the document whose norm is reset
 * @param field name of the field whose norm is reset
 * @param value the new byte-encoded norm
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doSetNorm(int doc, String field, byte value)
    throws CorruptIndexException, IOException;
721
722 /** Expert: Resets the normalization factor for the named field of the named
723 * document.
724 *
725 * @see #norms(String)
726 * @see Similarity#decodeNorm(byte)
727 *
728 * @throws StaleReaderException if the index has changed
729 * since this reader was opened
730 * @throws CorruptIndexException if the index is corrupt
731 * @throws LockObtainFailedException if another writer
732 * has this index open (<code>write.lock</code> could not
733 * be obtained)
734 * @throws IOException if there is a low-level IO error
735 */
736 public void setNorm(int doc, String field, float value)
737 throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
738 ensureOpen();
739 setNorm(doc, field, Similarity.encodeNorm(value));
740 }
741
/**
 * Returns an enumeration of all the terms in the index. The
 * enumeration is ordered by Term.compareTo(). Each term is greater
 * than all that precede it in the enumeration. Note that after
 * calling terms(), {@link TermEnum#next()} must be called
 * on the resulting enumeration before calling other methods such as
 * {@link TermEnum#term()}.
 *
 * @return an enumeration of all terms, ordered by Term.compareTo()
 * @throws IOException if there is a low-level IO error
 */
public abstract TermEnum terms() throws IOException;
751
/**
 * Returns an enumeration of all terms starting at a given term. If
 * the given term does not exist, the enumeration is positioned at the
 * first term greater than the supplied term. The enumeration is
 * ordered by Term.compareTo(). Each term is greater than all that
 * precede it in the enumeration.
 *
 * @param t the term at which the enumeration starts
 * @return an enumeration of terms, ordered by Term.compareTo()
 * @throws IOException if there is a low-level IO error
 */
public abstract TermEnum terms(Term t) throws IOException;
760
/**
 * Returns the number of documents containing the term <code>t</code>.
 *
 * @param t the term to look up
 * @return the number of documents containing <code>t</code>
 * @throws IOException if there is a low-level IO error
 */
public abstract int docFreq(Term t) throws IOException;
765
766 /** Returns an enumeration of all the documents which contain
767 * <code>term</code>. For each document, the document number, the frequency of
768 * the term in that document is also provided, for use in search scoring.
769 * Thus, this method implements the mapping:
770 * <p><ul>
771 * Term => <docNum, freq><sup>*</sup>
772 * </ul>
773 * <p>The enumeration is ordered by document number. Each document number
774 * is greater than all that precede it in the enumeration.
775 * @throws IOException if there is a low-level IO error
776 */
777 public TermDocs termDocs(Term term) throws IOException {
778 ensureOpen();
779 TermDocs termDocs = termDocs();
780 termDocs.seek(term);
781 return termDocs;
782 }
783
/**
 * Returns an unpositioned {@link TermDocs} enumerator.
 * {@link #termDocs(Term)} positions such an enumerator by calling
 * <code>seek</code> on it.
 *
 * @return an unpositioned {@link TermDocs} enumerator
 * @throws IOException if there is a low-level IO error
 */
public abstract TermDocs termDocs() throws IOException;
788
789 /** Returns an enumeration of all the documents which contain
790 * <code>term</code>. For each document, in addition to the document number
791 * and frequency of the term in that document, a list of all of the ordinal
792 * positions of the term in the document is available. Thus, this method
793 * implements the mapping:
794 *
795 * <p><ul>
796 * Term => <docNum, freq,
797 * <pos<sub>1</sub>, pos<sub>2</sub>, ...
798 * pos<sub>freq-1</sub>>
799 * ><sup>*</sup>
800 * </ul>
801 * <p> This positional information facilitates phrase and proximity searching.
802 * <p>The enumeration is ordered by document number. Each document number is
803 * greater than all that precede it in the enumeration.
804 * @throws IOException if there is a low-level IO error
805 */
806 public TermPositions termPositions(Term term) throws IOException {
807 ensureOpen();
808 TermPositions termPositions = termPositions();
809 termPositions.seek(term);
810 return termPositions;
811 }
812
/**
 * Returns an unpositioned {@link TermPositions} enumerator.
 * {@link #termPositions(Term)} positions such an enumerator by calling
 * <code>seek</code> on it.
 *
 * @return an unpositioned {@link TermPositions} enumerator
 * @throws IOException if there is a low-level IO error
 */
public abstract TermPositions termPositions() throws IOException;
817
818
819
820 /** Deletes the document numbered <code>docNum</code>. Once a document is
821 * deleted it will not appear in TermDocs or TermPostitions enumerations.
822 * Attempts to read its field with the {@link #document}
823 * method will result in an error. The presence of this document may still be
824 * reflected in the {@link #docFreq} statistic, though
825 * this will be corrected eventually as the index is further modified.
826 *
827 * @throws StaleReaderException if the index has changed
828 * since this reader was opened
829 * @throws CorruptIndexException if the index is corrupt
830 * @throws LockObtainFailedException if another writer
831 * has this index open (<code>write.lock</code> could not
832 * be obtained)
833 * @throws IOException if there is a low-level IO error
834 */
835 public synchronized void deleteDocument(int docNum) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
836 ensureOpen();
837 acquireWriteLock();
838 hasChanges = true;
839 doDelete(docNum);
840 }
841
842
/**
 * Implements deletion of the document numbered <code>docNum</code>.
 * Applications should call {@link #deleteDocument(int)} or {@link #deleteDocuments(Term)}.
 * {@link #deleteDocument(int)} invokes this method after it has called
 * <code>ensureOpen()</code> and <code>acquireWriteLock()</code> and has set
 * <code>hasChanges</code>.
 *
 * @param docNum number of the document to delete
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doDelete(int docNum) throws CorruptIndexException, IOException;
847
848
849 /** Deletes all documents that have a given <code>term</code> indexed.
850 * This is useful if one uses a document field to hold a unique ID string for
851 * the document. Then to delete such a document, one merely constructs a
852 * term with the appropriate field and the unique ID string as its text and
853 * passes it to this method.
854 * See {@link #deleteDocument(int)} for information about when this deletion will
855 * become effective.
856 *
857 * @return the number of documents deleted
858 * @throws StaleReaderException if the index has changed
859 * since this reader was opened
860 * @throws CorruptIndexException if the index is corrupt
861 * @throws LockObtainFailedException if another writer
862 * has this index open (<code>write.lock</code> could not
863 * be obtained)
864 * @throws IOException if there is a low-level IO error
865 */
866 public int deleteDocuments(Term term) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
867 ensureOpen();
868 TermDocs docs = termDocs(term);
869 if (docs == null) return 0;
870 int n = 0;
871 try {
872 while (docs.next()) {
873 deleteDocument(docs.doc());
874 n++;
875 }
876 } finally {
877 docs.close();
878 }
879 return n;
880 }
881
882 /** Undeletes all documents currently marked as deleted in this index.
883 *
884 * @throws StaleReaderException if the index has changed
885 * since this reader was opened
886 * @throws LockObtainFailedException if another writer
887 * has this index open (<code>write.lock</code> could not
888 * be obtained)
889 * @throws CorruptIndexException if the index is corrupt
890 * @throws IOException if there is a low-level IO error
891 */
892 public synchronized void undeleteAll() throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
893 ensureOpen();
894 acquireWriteLock();
895 hasChanges = true;
896 doUndeleteAll();
897 }
898
/**
 * Implements actual undeleteAll() in subclass. Invoked by
 * {@link #undeleteAll()} after that method has called
 * <code>ensureOpen()</code> and <code>acquireWriteLock()</code> and has set
 * <code>hasChanges</code>.
 *
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doUndeleteAll() throws CorruptIndexException, IOException;
901
/**
 * Does nothing by default. Subclasses that require a write lock for
 * index modifications must implement this method.
 *
 * <p>Called by {@link #setNorm(int, String, byte)},
 * {@link #deleteDocument(int)} and {@link #undeleteAll()} right after
 * <code>ensureOpen()</code>, before the change is flagged and applied.
 *
 * @throws IOException not thrown by this default implementation;
 *         overriding subclasses may throw it
 */
protected synchronized void acquireWriteLock() throws IOException {
  /* NOOP */
}
907
/**
 * Commits pending changes: calls <code>ensureOpen()</code> and then
 * delegates to {@link #commit()}, which invokes <code>doCommit()</code>
 * only when changes have been flagged.
 *
 * @throws IOException if there is a low-level IO error
 */
public final synchronized void flush() throws IOException {
  ensureOpen();
  commit();
}
916
917 /**
918 * Commit changes resulting from delete, undeleteAll, or
919 * setNorm operations
920 *
921 * If an exception is hit, then either no changes or all
922 * changes will have been committed to the index
923 * (transactional semantics).
924 * @throws IOException if there is a low-level IO error
925 */
926 protected final synchronized void commit() throws IOException {
927 if(hasChanges){
928 doCommit();
929 }
930 hasChanges = false;
931 }
932
/**
 * Implements commit. {@link #commit()} calls this method only when
 * <code>hasChanges</code> is set, and clears that flag after this method
 * returns normally.
 *
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doCommit() throws IOException;
935
936 /**
937 * Closes files associated with this index.
938 * Also saves any new deletions to disk.
939 * No other methods should be called after this has been called.
940 * @throws IOException if there is a low-level IO error
941 */
942 public final synchronized void close() throws IOException {
943 if (!closed) {
944 decRef();
945 closed = true;
946 }
947 }
948
/**
 * Implements close.
 *
 * @throws IOException if there is a low-level IO error
 */
// NOTE(review): close() itself only calls decRef(); presumably doClose() is
// reached from there -- confirm against decRef().
protected abstract void doClose() throws IOException;
951
952
/**
 * Get a list of unique field names that exist in this index and have the specified
 * field option information.
 *
 * @param fldOption specifies which field option should be available for the returned fields
 * @return Collection of Strings indicating the names of the fields.
 * @see IndexReader.FieldOption
 */
public abstract Collection getFieldNames(FieldOption fldOption);
961
962 /**
963 * Returns <code>true</code> iff the index in the named directory is
964 * currently locked.
965 * @param directory the directory to check for a lock
966 * @throws IOException if there is a low-level IO error
967 * @deprecated Please use {@link IndexWriter#isLocked(Directory)} instead
968 */
969 public static boolean isLocked(Directory directory) throws IOException {
970 return
971 directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked();
972 }
973
974 /**
975 * Returns <code>true</code> iff the index in the named directory is
976 * currently locked.
977 * @param directory the directory to check for a lock
978 * @throws IOException if there is a low-level IO error
979 * @deprecated Please use {@link IndexWriter#isLocked(String)} instead
980 */
981 public static boolean isLocked(String directory) throws IOException {
982 Directory dir = FSDirectory.getDirectory(directory);
983 boolean result = isLocked(dir);
984 dir.close();
985 return result;
986 }
987
988 /**
989 * Forcibly unlocks the index in the named directory.
990 * <P>
991 * Caution: this should only be used by failure recovery code,
992 * when it is known that no other process nor thread is in fact
993 * currently accessing this index.
994 * @deprecated Please use {@link IndexWriter#unlock(Directory)} instead
995 */
996 public static void unlock(Directory directory) throws IOException {
997 directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
998 }
999
1000 /**
1001 * Expert: return the IndexCommit that this reader has
1002 * opened. This method is only implemented by those
1003 * readers that correspond to a Directory with its own
1004 * segments_N file.
1005 *
1006 * <p><b>WARNING</b>: this API is new and experimental and
1007 * may suddenly change.</p>
1008 */
1009 public IndexCommit getIndexCommit() throws IOException {
1010 throw new UnsupportedOperationException("This reader does not support this method.");
1011 }
1012
1013 /**
1014 * Prints the filename and size of each file within a given compound file.
1015 * Add the -extract flag to extract files to the current working directory.
1016 * In order to make the extracted version of the index work, you have to copy
1017 * the segments file from the compound index into the directory where the extracted files are stored.
1018 * @param args Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>
1019 */
1020 public static void main(String [] args) {
1021 String filename = null;
1022 boolean extract = false;
1023
1024 for (int i = 0; i < args.length; ++i) {
1025 if (args[i].equals("-extract")) {
1026 extract = true;
1027 } else if (filename == null) {
1028 filename = args[i];
1029 }
1030 }
1031
1032 if (filename == null) {
1033 System.out.println("Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>");
1034 return;
1035 }
1036
1037 Directory dir = null;
1038 CompoundFileReader cfr = null;
1039
1040 try {
1041 File file = new File(filename);
1042 String dirname = file.getAbsoluteFile().getParent();
1043 filename = file.getName();
1044 dir = FSDirectory.getDirectory(dirname);
1045 cfr = new CompoundFileReader(dir, filename);
1046
1047 String [] files = cfr.list();
1048 Arrays.sort(files); // sort the array of filename so that the output is more readable
1049
1050 for (int i = 0; i < files.length; ++i) {
1051 long len = cfr.fileLength(files[i]);
1052
1053 if (extract) {
1054 System.out.println("extract " + files[i] + " with " + len + " bytes to local directory...");
1055 IndexInput ii = cfr.openInput(files[i]);
1056
1057 FileOutputStream f = new FileOutputStream(files[i]);
1058
1059 // read and write with a small buffer, which is more effectiv than reading byte by byte
1060 byte[] buffer = new byte[1024];
1061 int chunk = buffer.length;
1062 while(len > 0) {
1063 final int bufLen = (int) Math.min(chunk, len);
1064 ii.readBytes(buffer, 0, bufLen);
1065 f.write(buffer, 0, bufLen);
1066 len -= bufLen;
1067 }
1068
1069 f.close();
1070 ii.close();
1071 }
1072 else
1073 System.out.println(files[i] + ": " + len + " bytes");
1074 }
1075 } catch (IOException ioe) {
1076 ioe.printStackTrace();
1077 }
1078 finally {
1079 try {
1080 if (dir != null)
1081 dir.close();
1082 if (cfr != null)
1083 cfr.close();
1084 }
1085 catch (IOException ioe) {
1086 ioe.printStackTrace();
1087 }
1088 }
1089 }
1090
1091 /** Returns all commit points that exist in the Directory.
1092 * Normally, because the default is {@link
1093 * KeepOnlyLastCommitDeletionPolicy}, there would be only
1094 * one commit point. But if you're using a custom {@link
1095 * IndexDeletionPolicy} then there could be many commits.
1096 * Once you have a given commit, you can open a reader on
1097 * it by calling {@link IndexReader#open(IndexCommit)}
1098 * There must be at least one commit in
1099 * the Directory, else this method throws {@link
1100 * java.io.IOException}. Note that if a commit is in
1101 * progress while this method is running, that commit
1102 * may or may not be returned array. */
1103 public static Collection listCommits(Directory dir) throws IOException {
1104 return DirectoryIndexReader.listCommits(dir);
1105 }
1106}
1107| IndexReader.java |