核心提示:先建立索引。Java 代码:import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.H...
先建立索引
- import java.io.File;
- import java.io.FileFilter;
- import java.io.FileReader;
- import java.io.IOException;
- import java.io.Reader;
- import java.util.HashSet;
- import java.util.Set;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.wltea.analyzer.lucene.IKAnalyzer;
- import vrvclient.util.ConfigUtil;
- import vrvclient.util.FileListUtil;
- /**
- *
- * @author lan
- */
- public class Indexer extends Constants {
- private static final Log log = LogFactory.getLog(Indexer.class);
- public void index(File[] files, FileFilter filter) {
- if (files == null) {
- return;
- }
- // System.out.println(filter.getClass());
- Set<File> set = new HashSet<File>();
- for (File f : files) {//过滤掉不合要求的文件,如后缀,文件名等
- FileListUtil.list(f, filter, set);
- }
- File indexDir = new File(ConfigUtil.getIndexPath());//这里是获得索引文件的保存路径的
- Analyzer analyzer = new IKAnalyzer();//使用国产的IK分词器,很好很强大
- try {
- FSDirectory dir = FSDirectory.open(indexDir);//保存到硬盘上
- IndexWriter iw = new IndexWriter(dir, analyzer, !IndexReader.indexExists(dir), IndexWriter.MaxFieldLength.LIMITED);
- for (File f : set) {
- if (f.isFile()) {
- // System.out.println(f.getAbsolutePath());
- Document doc = new Document();
- Reader reader = new FileReader(f);
- doc.add(new Field(PATH, f.getAbsolutePath(), Field.Store.YES, Field.Index.ANALYZED));//保存路径
- doc.add(new Field(FILE, reader));//保存文件
- iw.addDocument(doc);
- reader.close();
- }
- }
- iw.optimize();
- iw.close();
- } catch (CorruptIndexException ex) {
- log.error(ex.getMessage(), ex);
- } catch (LockObtainFailedException ex) {
- log.error(ex.getMessage(), ex);
- } catch (IOException ex) {
- log.error(ex.getMessage(), ex);
- }
- }
- }
import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.HashSet; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.wltea.analyzer.lucene.IKAnalyzer; import vrvclient.util.ConfigUtil; import vrvclient.util.FileListUtil; /** * * @author lan */ public class Indexer extends Constants { private static final Log log = LogFactory.getLog(Indexer.class); public void index(File[] files, FileFilter filter) { if (files == null) { return; } // System.out.println(filter.getClass()); Set<File> set = new HashSet<File>(); for (File f : files) {//过滤掉不合要求的文件,如后缀,文件名等 FileListUtil.list(f, filter, set); } File indexDir = new File(ConfigUtil.getIndexPath());//这里是获得索引文件的保存路径的 Analyzer analyzer = new IKAnalyzer();//使用国产的IK分词器,很好很强大 try { FSDirectory dir = FSDirectory.open(indexDir);//保存到硬盘上 IndexWriter iw = new IndexWriter(dir, analyzer, !IndexReader.indexExists(dir), IndexWriter.MaxFieldLength.LIMITED); for (File f : set) { if (f.isFile()) { // System.out.println(f.getAbsolutePath()); Document doc = new Document(); Reader reader = new FileReader(f); doc.add(new Field(PATH, f.getAbsolutePath(), Field.Store.YES, Field.Index.ANALYZED));//保存路径 doc.add(new Field(FILE, reader));//保存文件 iw.addDocument(doc); reader.close(); } } iw.optimize(); iw.close(); } catch (CorruptIndexException ex) { log.error(ex.getMessage(), ex); } catch (LockObtainFailedException ex) { log.error(ex.getMessage(), ex); } catch (IOException ex) { log.error(ex.getMessage(), ex); } } }
下面是搜索部分的代码(返回命中文件的路径):
- import java.io.File;
- import java.io.IOException;
- import java.util.HashSet;
- import java.util.Set;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.FSDirectory;
- import org.wltea.analyzer.lucene.IKAnalyzer;
- import vrvclient.util.ConfigUtil;
- /**
- *
- * @author lan
- */
- public class Searcher extends Constants {
- private static final Log log = LogFactory.getLog(Indexer.class);
- /**
- *
- * @param contents
- * @param combineMode true = and,false = or
- * @param limit -1=all
- */
- public Set<String> search(String[] contents, boolean combineMode, int limit) {
- Set<String> paths = new HashSet<String>();
- try {
- File indexDir = new File(ConfigUtil.getIndexPath());
- FSDirectory fsd = FSDirectory.open(indexDir);
- IndexSearcher is = new IndexSearcher(fsd, true);
- Analyzer analyzer = new IKAnalyzer();
- if (fsd.getFile().exists()) {
- QueryParser qp = new QueryParser(FILE, analyzer);
- StringBuilder sb = new StringBuilder();
- String jioner = "";
- if (combineMode) {//如果是and,则所有条件要同时满足
- jioner = "+";
- }
- boolean b = true;
- for (String s : contents) {
- s = s.replaceAll("\\s+", " AND ");//防止条件中的空格被看成“或”,把其变成“与”
- if (!b) {
- sb.append(" ");
- }
- sb.append(jioner).append("(").append(s).append(")");
- b = false;
- }
- Query q = qp.parse(sb.toString());
- log.info(q.toString());
- if (limit == -1) {
- limit = is.maxDoc();
- }
- TopDocs hits = is.search(q, limit);
- ScoreDoc[] sds = hits.scoreDocs;
- for (int i = 0; i < sds.length; i++) {
- ScoreDoc sd = sds[i];
- Document doc = is.doc(sd.doc);
- // System.out.println("Hit:(" + sd.score + ")" + doc.toString());
- //本来想同时得到命中次数,也就是词频,但是网上找到的都是老版本的,这里不能用。
- paths.add(doc.get(PATH));//返回匹配的路径
- }
- }
- is.close();
- } catch (ParseException ex) {
- log.error(ex.getMessage(), ex);
- } catch (IOException ex) {
- log.error(ex.getMessage(), ex);
- }
- return paths;
- }
- }
import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.wltea.analyzer.lucene.IKAnalyzer; import vrvclient.util.ConfigUtil; /** * * @author lan */ public class Searcher extends Constants { private static final Log log = LogFactory.getLog(Indexer.class); /** * * @param contents * @param combineMode true = and,false = or * @param limit -1=all */ public Set<String> search(String[] contents, boolean combineMode, int limit) { Set<String> paths = new HashSet<String>(); try { File indexDir = new File(ConfigUtil.getIndexPath()); FSDirectory fsd = FSDirectory.open(indexDir); IndexSearcher is = new IndexSearcher(fsd, true); Analyzer analyzer = new IKAnalyzer(); if (fsd.getFile().exists()) { QueryParser qp = new QueryParser(FILE, analyzer); StringBuilder sb = new StringBuilder(); String jioner = ""; if (combineMode) {//如果是and,则所有条件要同时满足 jioner = "+"; } boolean b = true; for (String s : contents) { s = s.replaceAll("\\s+", " AND ");//防止条件中的空格被看成“或”,把其变成“与” if (!b) { sb.append(" "); } sb.append(jioner).append("(").append(s).append(")"); b = false; } Query q = qp.parse(sb.toString()); log.info(q.toString()); if (limit == -1) { limit = is.maxDoc(); } TopDocs hits = is.search(q, limit); ScoreDoc[] sds = hits.scoreDocs; for (int i = 0; i < sds.length; i++) { ScoreDoc sd = sds[i]; Document doc = is.doc(sd.doc); // System.out.println("Hit:(" + sd.score + ")" + doc.toString()); //本来想同时得到命中次数,也就是词频,但是网上找到的都是老版本的,这里不能用。 paths.add(doc.get(PATH));//返回匹配的路径 } 
} is.close(); } catch (ParseException ex) { log.error(ex.getMessage(), ex); } catch (IOException ex) { log.error(ex.getMessage(), ex); } return paths; } }
附用到的工具类:
- import java.io.File;
- import java.io.FileFilter;
- import java.util.Map;
- import java.util.Set;
- /**
- *
- * @author lan
- */
- public final class FileListUtil {
- //返回一定数量的符合要求的文件
- public static void list(File f, FileFilter filter, Set<File> set, int limit) {
- if (limit > -1 && set.size() >= limit) {
- return;
- }
- if (f == null) {
- return;
- }
- if (f.isFile()) {
- set.add(f);
- } else if (f.isDirectory()) {
- File[] files = null;
- if (filter == null) {
- files = f.listFiles();
- } else {
- files = f.listFiles(filter);
- }
- if (files != null) {
- for (File file : files) {
- list(file, filter, set, limit);
- }
- }
- }
- }
- //返回所有的符合要求的文件,不要担心set放不上,至少我的D盘资料盘都放进去都没有内存溢出
- public static void list(File f, FileFilter filter, Set<File> set) {
- if (f == null) {
- return;
- }
- if (f.isFile()) {
- set.add(f);
- } else if (f.isDirectory()) {
- File[] files = null;
- if (filter == null) {
- files = f.listFiles();
- } else {
- files = f.listFiles(filter);
- }
- if (files != null) {
- for (File file : files) {
- list(file, filter, set);
- }
- }
- }
- }
- //其实是打包工具类用到的
- public static void list(File f, FileFilter filter, String parent, Map<String, File> map) {
- if (f == null) {
- return;
- }
- String name = f.getName();
- if (parent != null) {
- name = parent + "/" + name;
- }
- if (f.isFile()) {
- map.put(name, f);
- } else if (f.isDirectory()) {
- File[] files = null;
- if (filter == null) {
- files = f.listFiles();
- } else {
- files = f.listFiles(filter);
- }
- if (files != null) {
- for (File file : files) {
- list(file, filter, name, map);
- }
- }
- }
- }
- }
import java.io.File; import java.io.FileFilter; import java.util.Map; import java.util.Set; /** * * @author lan */ public final class FileListUtil { //返回一定数量的符合要求的文件 public static void list(File f, FileFilter filter, Set<File> set, int limit) { if (limit > -1 && set.size() >= limit) { return; } if (f == null) { return; } if (f.isFile()) { set.add(f); } else if (f.isDirectory()) { File[] files = null; if (filter == null) { files = f.listFiles(); } else { files = f.listFiles(filter); } if (files != null) { for (File file : files) { list(file, filter, set, limit); } } } } //返回所有的符合要求的文件,不要担心set放不上,至少我的D盘资料盘都放进去都没有内存溢出 public static void list(File f, FileFilter filter, Set<File> set) { if (f == null) { return; } if (f.isFile()) { set.add(f); } else if (f.isDirectory()) { File[] files = null; if (filter == null) { files = f.listFiles(); } else { files = f.listFiles(filter); } if (files != null) { for (File file : files) { list(file, filter, set); } } } } //其实是打包工具类用到的 public static void list(File f, FileFilter filter, String parent, Map<String, File> map) { if (f == null) { return; } String name = f.getName(); if (parent != null) { name = parent + "/" + name; } if (f.isFile()) { map.put(name, f); } else if (f.isDirectory()) { File[] files = null; if (filter == null) { files = f.listFiles(); } else { files = f.listFiles(filter); } if (files != null) { for (File file : files) { list(file, filter, name, map); } } } } }
常量类。很多人习惯用接口来保存常量,虽然那样更方便一点,但不合规范,所以我这里用普通的类。
/**
 * Names of the index fields shared by the indexing and searching classes,
 * which extend this class to pick them up.
 *
 * <p>Original author's note: this could be replaced by an enum, but it was
 * not considered worth the effort.
 */
public class Constants {

    /** Field holding the indexed file contents. */
    protected static final String FILE = "file";

    /** Field holding the file path. */
    protected static final String PATH = "path";
}