用户名: 密 码:
您现在的位置:首页 >> SEO开发技巧 >> 内容

lucene检索技巧汇总

时间:2010/4/7 20:57:07 点击:8242

  核心提示:package org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the N...

package org.apache.lucene.search;  

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.document.Document;  
import org.apache.lucene.document.FieldSelector;  
import org.apache.lucene.index.CorruptIndexException;  
import org.apache.lucene.index.IndexReader;  
import org.apache.lucene.index.Term;  
import org.apache.lucene.store.Directory;  

import java.io.IOException;  

/** Implements search over a single IndexReader.
*
* <p>Applications usually need only call the inherited {@link #search(Query)}
* or {@link #search(Query,Filter)} methods. For performance reasons it is  
* recommended to open only one IndexSearcher and use it for all of your searches.
*  
* <p>Note that you can only access Hits from an IndexSearcher as long as it is
* not yet closed, otherwise an IOException will be thrown.  
*/
public class IndexSearcher extends Searcher {  
IndexReader reader;  
private boolean closeReader;  

/** Creates a searcher searching the index in the named directory.
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
public IndexSearcher(String path) throws CorruptIndexException, IOException {  
    this(IndexReader.open(path), true);  
}  

/** Creates a searcher searching the index in the provided directory.
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
public IndexSearcher(Directory directory) throws CorruptIndexException, IOException {  
    this(IndexReader.open(directory), true);  
}  

/** Creates a searcher searching the provided index. */
public IndexSearcher(IndexReader r) {  
    this(r, false);  
}  
    
private IndexSearcher(IndexReader r, boolean closeReader) {  
    reader = r;  
    this.closeReader = closeReader;  
}  

/** Return the {@link IndexReader} this searches. */
public IndexReader getIndexReader() {  
    return reader;  
}  

/**
   * Note that the underlying IndexReader is not closed, if
   * IndexSearcher was constructed with IndexSearcher(IndexReader r).
   * If the IndexReader was supplied implicitly by specifying a directory, then
   * the IndexReader gets closed.
   */
public void close() throws IOException {  
    if(closeReader)  
      reader.close();  
}  

// inherit javadoc  
public int docFreq(Term term) throws IOException {  
    return reader.docFreq(term);  
}  

// inherit javadoc  
public Document doc(int i) throws CorruptIndexException, IOException {  
    return reader.document(i);  
}  
    
// inherit javadoc  
public Document doc(int i, FieldSelector fieldSelector) throws CorruptIndexException, IOException {  
        return reader.document(i, fieldSelector);  
}  
    
// inherit javadoc  
public int maxDoc() throws IOException {  
    return reader.maxDoc();  
}  

// inherit javadoc  
public TopDocs search(Weight weight, Filter filter, final int nDocs)  
       throws IOException {  

    if (nDocs <= 0) // null might be returned from hq.top() below.  
      throw new IllegalArgumentException("nDocs must be > 0");  

    TopDocCollector collector = new TopDocCollector(nDocs);  
    search(weight, filter, collector);  
    return collector.topDocs();  
}  

// inherit javadoc  
public TopFieldDocs search(Weight weight, Filter filter, final int nDocs,  
                             Sort sort)  
      throws IOException {  

    TopFieldDocCollector collector =  
      new TopFieldDocCollector(reader, sort, nDocs);  
    search(weight, filter, collector);  
    return (TopFieldDocs)collector.topDocs();  
}  

// inherit javadoc  
public void search(Weight weight, Filter filter,  
                     final HitCollector results) throws IOException {  

    Scorer scorer = weight.scorer(reader);  
    if (scorer == null)  
      return;  

    if (filter == null) {  
      scorer.score(results);  
      return;  
    }  

    DocIdSetIterator filterDocIdIterator = filter.getDocIdSet(reader).iterator(); // CHECKME: use ConjunctionScorer here?  
      
    boolean more = filterDocIdIterator.next() && scorer.skipTo(filterDocIdIterator.doc());  

    while (more) {  
      int filterDocId = filterDocIdIterator.doc();  
      if (filterDocId > scorer.doc() && !scorer.skipTo(filterDocId)) {  
        more = false;  
      } else {  
        int scorerDocId = scorer.doc();  
        if (scorerDocId == filterDocId) { // permitted by filter  
          results.collect(scorerDocId, scorer.score());  
          more = filterDocIdIterator.next();  
        } else {  
          more = filterDocIdIterator.skipTo(scorerDocId);  
        }  
      }  
    }  
}  

public Query rewrite(Query original) throws IOException {  
    Query query = original;  
    for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query;  
         rewrittenQuery = query.rewrite(reader)) {  
      query = rewrittenQuery;  
    }  
    return query;  
}  

public Explanation explain(Weight weight, int doc) throws IOException {  
    return weight.explain(reader, doc);  
}  
}
package org.apache.lucene.search;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;

import java.io.IOException;

/** Implements search over a single IndexReader.
*
* <p>Applications usually need only call the inherited {@link #search(Query)}
* or {@link #search(Query,Filter)} methods. For performance reasons it is
* recommended to open only one IndexSearcher and use it for all of your searches.
*
* <p>Note that you can only access Hits from an IndexSearcher as long as it is
* not yet closed, otherwise an IOException will be thrown.
*/
public class IndexSearcher extends Searcher {
IndexReader reader;
private boolean closeReader;

/** Creates a searcher searching the index in the named directory.
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
public IndexSearcher(String path) throws CorruptIndexException, IOException {
    this(IndexReader.open(path), true);
}

/** Creates a searcher searching the index in the provided directory.
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
public IndexSearcher(Directory directory) throws CorruptIndexException, IOException {
    this(IndexReader.open(directory), true);
}

/** Creates a searcher searching the provided index. */
public IndexSearcher(IndexReader r) {
    this(r, false);
}

private IndexSearcher(IndexReader r, boolean closeReader) {
    reader = r;
    this.closeReader = closeReader;
}

/** Return the {@link IndexReader} this searches. */
public IndexReader getIndexReader() {
    return reader;
}

/**
   * Note that the underlying IndexReader is not closed, if
   * IndexSearcher was constructed with IndexSearcher(IndexReader r).
   * If the IndexReader was supplied implicitly by specifying a directory, then
   * the IndexReader gets closed.
   */
public void close() throws IOException {
    if(closeReader)
      reader.close();
}

// inherit javadoc
public int docFreq(Term term) throws IOException {
    return reader.docFreq(term);
}

// inherit javadoc
public Document doc(int i) throws CorruptIndexException, IOException {
    return reader.document(i);
}

// inherit javadoc
public Document doc(int i, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
     return reader.document(i, fieldSelector);
}

// inherit javadoc
public int maxDoc() throws IOException {
    return reader.maxDoc();
}

// inherit javadoc
public TopDocs search(Weight weight, Filter filter, final int nDocs)
       throws IOException {

    if (nDocs <= 0) // null might be returned from hq.top() below.
      throw new IllegalArgumentException("nDocs must be > 0");

    TopDocCollector collector = new TopDocCollector(nDocs);
    search(weight, filter, collector);
    return collector.topDocs();
}

// inherit javadoc
public TopFieldDocs search(Weight weight, Filter filter, final int nDocs,
                             Sort sort)
      throws IOException {

    TopFieldDocCollector collector =
      new TopFieldDocCollector(reader, sort, nDocs);
    search(weight, filter, collector);
    return (TopFieldDocs)collector.topDocs();
}

// inherit javadoc
public void search(Weight weight, Filter filter,
                     final HitCollector results) throws IOException {

    Scorer scorer = weight.scorer(reader);
    if (scorer == null)
      return;

    if (filter == null) {
      scorer.score(results);
      return;
    }

    DocIdSetIterator filterDocIdIterator = filter.getDocIdSet(reader).iterator(); // CHECKME: use ConjunctionScorer here?
   
    boolean more = filterDocIdIterator.next() && scorer.skipTo(filterDocIdIterator.doc());

    while (more) {
      int filterDocId = filterDocIdIterator.doc();
      if (filterDocId > scorer.doc() && !scorer.skipTo(filterDocId)) {
        more = false;
      } else {
        int scorerDocId = scorer.doc();
        if (scorerDocId == filterDocId) { // permitted by filter
          results.collect(scorerDocId, scorer.score());
          more = filterDocIdIterator.next();
        } else {
          more = filterDocIdIterator.skipTo(scorerDocId);
        }
      }
    }
}

public Query rewrite(Query original) throws IOException {
    Query query = original;
    for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query;
         rewrittenQuery = query.rewrite(reader)) {
      query = rewrittenQuery;
    }
    return query;
}

public Explanation explain(Weight weight, int doc) throws IOException {
    return weight.explain(reader, doc);
}
}

IndexSearcher类 查询器
搜索入口,继承自Searcher
1.public IndexSearcher(Directory directory)
使用方法
String IndexPath="D:/IndexPath";
Directory directory=FSDirectory.getDirectory(IndexPath);
IndexSearcher searcher=new IndexSearcher(directory);
支持RAM存储的索引,提高检索速度,建议使用,因为此方法将索引存放的路径与搜索分离
2.public IndexSearcher(String path)
直接操作索引目录.不支持RAM存储的索引
IndexSearcher searcher=new IndexSearcher("D:/IndexPath");
3.public IndexSearcher(IndexReader r)
IndexSearcher searcher=new IndexSearcher(reader);
4.private IndexSearcher(IndexReader r, boolean closeReader)
在3的基础上多了一个boolean类型参数,用于判断在关闭IndexSearcher时是否要同时关闭所带的IndexReader对象

多索引目录搜索就是要在多个索引目录中同时进行搜索,类似概念在SQL中就是select * from TableA union select * from TableB。
IndexSearcher[] searchers = new IndexSearcher[2];
searchers[0] = new IndexSearcher(IndexPath0);
searchers[1] = new IndexSearcher(IndexPath1);

IndexSearcher类的主要方法Search 通过重载实现多种检索方式.通过其参数控制检索.
参数解释
Weight weight               权重 指定索引中文档重要性的参数,可改变其默认值
HitCollector results       保存搜索的所有结果.
Filter filter                      指定对结果进行过滤的方式
Query query                  每个Search必须的对象参数.指定检索的方式
Sort sort                        指定检索排序的方法.可自定义排序方式进行结果的排序和输出

Query有很多的子类 指定了不同的查询方式,query是用户输入的内容,analyzer是用来将用户输入的内容也作分析处理
TermQuery
Term t=new Term("contents","lucene"); 构造TermQuery把查询条件视为一个key, 要求和查询内容完全匹配,比如Field.Keyword类型就可以使用TermQuery

RangeQuery    区间检索
RangeQuery 表示一个范围的搜索条件,在年龄,日期,工资等数字类的索引库中常用, RangeQuery query = new RangeQuery(begin, end, included);类似sql中between...and..., 最后一个boolean值表示是否包含边界条件本身, 用字符表示为"[begin TO end]"(包含边界) 或者"{begin TO end}"(不包含边界)

PrefixQuery    字符串前缀检索,如"sys*"

BooleanQuery 逻辑组合检索
组合的Query,你可以把各种Query添加进去并标明他们的逻辑关系,添加条件用public void add(Query query, boolean required, boolean prohibited)方法, 后两个boolean变量用来标示AND OR NOT三种关系, 字符表示为"AND OR NOT" 或 "+ -", 一个BooleanQuery中可以添加多个Query, 如果超过setMaxClauseCount(int)的值(默认1024个)的话,会抛出TooManyClauses错误.

PhraseQuery 短语检索
PhraseQuery提供了一个setSlop()方法,在查询中,lucene会尝试调整单词的距离和位置,slop参数表示可以接受的调整次数上限,如果实际的内容可以在这么多步内调整为完全匹配,那么就被视为匹配.在默认情况下slop的值是0, 所以默认是不支持非严格匹配的, 通过设置slop参数(比如"red pig"匹配"red fat pig"就需要1个slop来把pig后移1位),我们可以让lucene来模糊查询. 值得注意的是,PhraseQuery不保证前后单词的次序,在上面的例子中,"pig red"需要2个slop,也就是说如果slop大于等于2,那么"pig red"也会被认为是匹配的.

WildcardQuery 通配符检索
使用?和*来表示一个或多个字母比如sys*可以匹配 system ,systop,systaltic…,
FuzzyQuery 模糊搜索
一般不处理中文,适用于英文的各种时态变化和复数形式,匹配结果的相关度是不一样的.

QueryParser使用
QueryParser将用户输入转为Query或者Query组, 将Query的字符表示(Query.toString)转化为实际的Query对象,

Hit搜索结果的处理:Hits对象
Hits对象是搜索结果的集合 主要有下面几个方法
1.length() ,   记录有多少条结果返回
2.doc(n)       返回第n个记录
3.id(n)          返回第n个记录的Document ID
4.score(n)       第n个记录的相关度(积分)

文章来源:http://www.xinxilong.com

作者:不详 来源:网络
相关评论
发表我的评论
  • 大名:
  • 内容:
  • 论坛群发大师(www.xinxilong.com) © 2008 版权所有 All Rights Reserved.
  • Email:4984672[at]qq.com 沪ICP备12025887号-1
  • Powered by 论坛群发大师