What pattern do the file names inside an index directory follow? Quote:
_9.cfs
_9.cfx
segments_k
segments.gen
Java code:
private final synchronized String newSegmentName() {
  return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}
newSegmentName() takes the current value of segmentInfos.counter, converts it to base 36 (Character.MAX_RADIX is 36), prefixes it with an underscore, and then post-increments the counter. So segmentInfos.counter records how many segment names have been handed out so far, which is exactly where names like _9.cfs and _9.cfx above come from.
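A minimal, self-contained sketch of the naming scheme (the counter field here is a hypothetical stand-in for the real SegmentInfos.counter):

Java code:
public class SegmentNameDemo {
    private int counter = 0; // stand-in for segmentInfos.counter

    private synchronized String newSegmentName() {
        // Character.MAX_RADIX is 36: digits 0-9, then letters a-z
        return "_" + Integer.toString(counter++, Character.MAX_RADIX);
    }

    public static void main(String[] args) {
        SegmentNameDemo demo = new SegmentNameDemo();
        for (int i = 0; i < 40; i++) {
            System.out.print(demo.newSegmentName() + " ");
        }
        // prints: _0 _1 ... _9 _a _b ... _z _10 _11 _12 _13
    }
}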
Next, document inversion. This is the most memory-hungry moment of indexing: besides the terms themselves, per-occurrence information such as positions and frequencies must be buffered.
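Before reading the source, it may help to see what "inversion" actually produces. A toy sketch (plain whitespace tokenization, nothing like Lucene's real analyzers or posting data structures) that maps each term to the positions where it occurs; the term frequency is simply the size of each position list:

Java code:
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ToyInverter {
    // term -> list of token positions within the document
    public static Map<String, List<Integer>> invert(String text) {
        Map<String, List<Integer>> postings = new HashMap<String, List<Integer>>();
        String[] tokens = text.toLowerCase().split("\\s+");
        for (int pos = 0; pos < tokens.length; pos++) {
            List<Integer> positions = postings.get(tokens[pos]);
            if (positions == null) {
                positions = new ArrayList<Integer>();
                postings.put(tokens[pos], positions);
            }
            positions.add(pos); // term frequency == positions.size()
        }
        return postings;
    }

    public static void main(String[] args) {
        // yields: to -> [0, 4], be -> [1, 5], or -> [2], not -> [3]
        System.out.println(invert("to be or not to be"));
    }
}

In Lucene itself this step is driven by the class below: processFields walks every field of the document, obtains a TokenStream for it, and forwards each token to its consumer.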
Java code:
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Holds state for inverting all occurrences of a single
 * field in the document.  This class doesn't do anything
 * itself; instead, it forwards the tokens produced by
 * analysis to its own consumer
 * (InvertedDocConsumerPerField).  It also interacts with an
 * endConsumer (InvertedDocEndConsumerPerField).
 */
final class DocInverterPerField extends DocFieldConsumerPerField {

  final private DocInverterPerThread perThread;
  final private FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
    this.perThread = perThread;
    this.fieldInfo = fieldInfo;
    docState = perThread.docState;
    fieldState = perThread.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    this.endConsumer = perThread.endConsumer.addField(this, fieldInfo);
  }

  void abort() {
    consumer.abort();
    endConsumer.abort();
  }

  public void processFields(final Fieldable[] fields,
                            final int count) throws IOException {

    fieldState.reset(docState.doc.getBoost());

    final int maxFieldLength = docState.maxFieldLength;

    final boolean doInvert = consumer.start(fields, count);

    for (int i = 0; i < count; i++) {

      final Fieldable field = fields[i];

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.
      if (field.isIndexed() && doInvert) {

        // leave a position gap between multiple values of the same field
        if (fieldState.length > 0)
          fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);

        if (!field.isTokenized()) {       // un-tokenized field
          String stringValue = field.stringValue();
          final int valueLength = stringValue.length();
          perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
          fieldState.attributeSource = perThread.singleTokenTokenStream;
          perThread.localTokenStream.reset();
          consumer.start(field);

          boolean success = false;
          try {
            consumer.add();
            success = true;
          } finally {
            if (!success)
              docState.docWriter.setAborting();
          }
          fieldState.offset += valueLength;
          fieldState.length++;
          fieldState.position++;
        } else {                          // tokenized field
          final TokenStream stream;
          final TokenStream streamValue = field.tokenStreamValue();

          if (streamValue != null)
            stream = streamValue;
          else {
            // the field does not have a TokenStream,
            // so we have to obtain one from the analyzer
            final Reader reader;          // find or make Reader
            final Reader readerValue = field.readerValue();

            if (readerValue != null)
              reader = readerValue;
            else {
              String stringValue = field.stringValue();
              if (stringValue == null)
                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
              perThread.stringReader.init(stringValue);
              reader = perThread.stringReader;
            }

            // Tokenize field and add to postingTable
            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
          }

          // reset the TokenStream to the first token
          stream.reset();

          try {
            int offsetEnd = fieldState.offset - 1;

            boolean useNewTokenStreamAPI = stream.useNewAPI();
            Token localToken = null;

            if (useNewTokenStreamAPI) {
              fieldState.attributeSource = stream;
            } else {
              fieldState.attributeSource = perThread.localTokenStream;
              localToken = perThread.localToken;
            }

            consumer.start(field);

            OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

            for (;;) {

              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID

              Token token = null;
              /*
               * token.termText    the term the analyzer produced
               * token.startOffset the term's start offset in the source text
               * token.endOffset   the term's end offset in the source text
               */
              if (useNewTokenStreamAPI) {
                if (!stream.incrementToken()) break;
              } else {
                token = stream.next(localToken);
                if (token == null) break;
                perThread.localTokenStream.set(token);
              }

              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr - 1;
              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }
              fieldState.position++;
              offsetEnd = fieldState.offset + offsetAttribute.endOffset();
              // truncate the field once maxFieldLength tokens have been indexed
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }
            }
            fieldState.offset = offsetEnd + 1;
          } finally {
            stream.close();
          }
        }

        fieldState.boost *= field.getBoost();
      }
    }

    consumer.finish();
    endConsumer.finish();
  }
}
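The token loop above is essentially the consuming side of the (then new) attribute-based TokenStream API. A standalone sketch of the same pattern, written against the Lucene 2.9-era API (the analyzer choice, field name, and sample text are arbitrary here):

Java code:
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamDemo {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
        TokenStream stream = analyzer.tokenStream("content",
                new StringReader("Lucene in Action"));

        // Attributes are registered once, then reused by every incrementToken() call,
        // just as DocInverterPerField registers them before its for(;;) loop.
        TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
        OffsetAttribute offset = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncr =
                (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

        stream.reset();
        int position = -1;
        while (stream.incrementToken()) {
            // "in" is a stop word, so "action" arrives with a position increment of 2,
            // leaving a hole at position 1
            position += posIncr.getPositionIncrement();
            System.out.println(term.term() + " pos=" + position
                    + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        stream.close();
    }
}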