核心提示:一个小小的搜索例子,实现对某个文件夹下文件的搜索。这里只给出主要代码,整个 project 在附件中;导入 MyEclipse 时请根据自己的情况修改配置文件 paoding-dic-home.properties 中的地址(前提是已安装庖丁解牛的词典)。在页面搜索“项目”即可看到结果。
一个小小的搜索例子,实现对某个文件夹下的文件进行搜索
这里只有主要代码,整个project在附件中,导入到MyEclipse中时根据自己的情况修改配置文件中paoding-dic-home.properties的地址,当然,前提是你必须有庖丁解牛的字典,在页面搜索“项目”,会出现结果(基本每个文件中都有项目这个词)
附件中包含项目 T_Search:待索引的数据文件位于 lucene\data 目录,生成的索引位于 lucene\index 目录。
MIndexer.java:创建索引(对文件夹下的文件建立索引,先把文件内容读取成 String)
- public class MIndexer {
- public void createIndex() {
- long start = System.currentTimeMillis();
- try {
- // 获取Paoding中文分词器
- Analyzer analyzer = new PaodingAnalyzer();
- // indexWriter建立索引,E:\lucene\index建立索引的目录
- IndexWriter writer = new IndexWriter("E:\\lucene\\index", analyzer, true,IndexWriter.MaxFieldLength.UNLIMITED);
- //E:\lucene\data建立索引的数据,主要是.txt、.pdf文件
- indexDocs(writer, new File("E:\\lucene\\data"));
- writer.optimize();
- writer.close();
- System.out.println("用时:" + (System.currentTimeMillis() - start) + " 毫秒");
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- // 遍历文件夹文件,对需要的文件建立索引
- static void indexDocs(IndexWriter writer, File file) throws IOException {
- if (file.canRead()) {
- if (file.isDirectory()) {
- String[] files = file.list();
- if (files != null) {
- for (int i = 0; i < files.length; i++) {
- indexDocs(writer, new File(file, files[i]));
- }
- }
- } else {
- if (file.getName().endsWith(".htm")
- || file.getName().endsWith(".html")
- || file.getName().endsWith(".jsp")
- || file.getName().endsWith(".php")
- || file.getName().endsWith(".txt")
- || file.getName().endsWith(".pdf")) {
- try {
- // 针对参数文件建立索引文档 ,一个Document就相当于一跳记录
- Document doc = new Document();
- // Field.Index.ANALYZED 文件名称 建立索引,分词
- doc.add(new Field("filename", file.getCanonicalPath(),
- Field.Store.YES, Field.Index.ANALYZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- if(file.getName().endsWith(".pdf")){
- doc.add(new Field("contents", pdf2txt(file),
- Field.Store.YES, Field.Index.ANALYZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- }else {
- doc.add(new Field("contents", ReadFile(file),
- Field.Store.YES, Field.Index.ANALYZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- }
- writer.addDocument(doc);
- } catch (FileNotFoundException fnfe) {
- ;
- }
- }
- }
- }
- }
- // 用字符串形式,读取一个File的内容
- public static String ReadFile(File f) {
- String line = null;
- StringBuffer temp = new StringBuffer();
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(
- new FileInputStream(f), "UTF-8"));
- while ((line = br.readLine()) != null) {
- temp.append(line);
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return temp.toString();
- }
- //若文件为pdf,就用这个读取
- public static String pdf2txt(File pfile) {
- String _content = "";
- if (pfile.exists() && pfile.getName().lastIndexOf(".pdf") >= 1) {
- String textFile = String.format("%s%s%s%s%s.txt",
- pfile.getPath().substring(0,
- pfile.getPath().lastIndexOf(pfile.getName())),
- System.getProperty("file.separator"), "temp", System
- .getProperty("file.separator"), pfile.getName()
- .substring(0, pfile.getName().lastIndexOf(".pdf")));
- if (!new File(textFile.substring(0, textFile.lastIndexOf(new File(
- textFile).getName()))).exists()) {
- new File(textFile.substring(0, textFile.lastIndexOf(new File(
- textFile).getName()))).mkdirs();
- }
- PDDocument pdDoc = null;
- COSDocument cosDoc = null;
- try {
- pdDoc = PDDocument.load(pfile);
- PDFParser parser = new PDFParser(new FileInputStream(pfile));
- parser.parse();
- cosDoc = parser.getDocument();
- PDFTextStripper stripper = new PDFTextStripper();
- _content = stripper.getText(new PDDocument(cosDoc));
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- cosDoc.close();
- pdDoc.close();
- if (new File(textFile).exists()) {
- new File(textFile).delete();
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- return _content;
- }
- }
public class MIndexer { public void createIndex() { long start = System.currentTimeMillis(); try { // 获取Paoding中文分词器 Analyzer analyzer = new PaodingAnalyzer(); // indexWriter建立索引,E:\lucene\index建立索引的目录 IndexWriter writer = new IndexWriter("E:\\lucene\\index", analyzer, true,IndexWriter.MaxFieldLength.UNLIMITED); //E:\lucene\data建立索引的数据,主要是.txt、.pdf文件 indexDocs(writer, new File("E:\\lucene\\data")); writer.optimize(); writer.close(); System.out.println("用时:" + (System.currentTimeMillis() - start) + " 毫秒"); } catch (IOException e) { e.printStackTrace(); } } // 遍历文件夹文件,对需要的文件建立索引 static void indexDocs(IndexWriter writer, File file) throws IOException { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { if (file.getName().endsWith(".htm") || file.getName().endsWith(".html") || file.getName().endsWith(".jsp") || file.getName().endsWith(".php") || file.getName().endsWith(".txt") || file.getName().endsWith(".pdf")) { try { // 针对参数文件建立索引文档 ,一个Document就相当于一跳记录 Document doc = new Document(); // Field.Index.ANALYZED 文件名称 建立索引,分词 doc.add(new Field("filename", file.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); if(file.getName().endsWith(".pdf")){ doc.add(new Field("contents", pdf2txt(file), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); }else { doc.add(new Field("contents", ReadFile(file), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); } writer.addDocument(doc); } catch (FileNotFoundException fnfe) { ; } } } } } // 用字符串形式,读取一个File的内容 public static String ReadFile(File f) { String line = null; StringBuffer temp = new StringBuffer(); try { BufferedReader br = new BufferedReader(new InputStreamReader( new FileInputStream(f), "UTF-8")); while ((line = br.readLine()) != null) { temp.append(line); } } catch 
(FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return temp.toString(); } //若文件为pdf,就用这个读取 public static String pdf2txt(File pfile) { String _content = ""; if (pfile.exists() && pfile.getName().lastIndexOf(".pdf") >= 1) { String textFile = String.format("%s%s%s%s%s.txt", pfile.getPath().substring(0, pfile.getPath().lastIndexOf(pfile.getName())), System.getProperty("file.separator"), "temp", System .getProperty("file.separator"), pfile.getName() .substring(0, pfile.getName().lastIndexOf(".pdf"))); if (!new File(textFile.substring(0, textFile.lastIndexOf(new File( textFile).getName()))).exists()) { new File(textFile.substring(0, textFile.lastIndexOf(new File( textFile).getName()))).mkdirs(); } PDDocument pdDoc = null; COSDocument cosDoc = null; try { pdDoc = PDDocument.load(pfile); PDFParser parser = new PDFParser(new FileInputStream(pfile)); parser.parse(); cosDoc = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); _content = stripper.getText(new PDDocument(cosDoc)); } catch (IOException e) { e.printStackTrace(); } finally { try { cosDoc.close(); pdDoc.close(); if (new File(textFile).exists()) { new File(textFile).delete(); } } catch (IOException e) { e.printStackTrace(); } } } return _content; } }
MSearcher.java:搜索,返回符合条件的List
- public class MSearcher {
- public List<MBean> searchIndex(String keyword, boolean highlight,
- int content_length, int start, int length) {
- String indexpath = "E:\\lucene\\index"; // 索引所在目录
- List<MBean> mList = new ArrayList<MBean>();
- if (indexpath != null && new File(indexpath).exists()
- && keyword != null && !keyword.trim().equals("") && length > 0) {
- start = (start > 0) ? start : 1;
- String[] FIELD = { "filename", "contents" };
- // 获取Paoding中文分词器
- Analyzer analyzer = new PaodingAnalyzer();
- FSDirectory directory;
- IndexReader reader;
- Searcher searcher;
- try {
- directory = FSDirectory.getDirectory(indexpath);
- reader = IndexReader.open(directory);
- String queryString = keyword;
- /*
- * 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 SHOULD表示查询条件为or
- * MUST表示查询条件为and MUST_NOT表示查询条件为not
- */
- BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
- BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
- Query query = MultiFieldQueryParser.parse(queryString, FIELD,
- flags, analyzer);
- searcher = new IndexSearcher(directory);
- query = query.rewrite(reader);
- //分页,取出前start + length - 1条数据
- TopDocCollector collector = new TopDocCollector(start + length - 1);
- searcher.search(query, collector);
- ScoreDoc[] hits = collector.topDocs().scoreDocs;
- BoldFormatter formatter = new BoldFormatter();
- Highlighter highlighter = new Highlighter(formatter,
- new QueryScorer(query));
- highlighter.setTextFragmenter(new SimpleFragmenter(
- content_length));
- for (int i = start - 1; i < hits.length; i++) {
- MBean mBean = new MBean();
- Document doc = searcher.doc(hits[i].doc);
- String _filename = doc.get(FIELD[0]);
- String _contents = doc.get(FIELD[1]);
- int maxNumFragmentsRequired = 5;
- String fragmentSeparator = "...";
- TermPositionVector tpv_filename = (TermPositionVector) reader
- .getTermFreqVector(hits[i].doc, FIELD[0]);
- TermPositionVector tpv_contents = (TermPositionVector) reader
- .getTermFreqVector(hits[i].doc, FIELD[1]);
- String high_filename = "";
- String high_contents = "";
- if (tpv_filename != null) {
- TokenStream token_filename = TokenSources
- .getTokenStream(tpv_filename);
- high_filename = highlighter.getBestFragments(
- token_filename, _filename,
- maxNumFragmentsRequired, fragmentSeparator);
- }
- if (tpv_contents != null) {
- TokenStream token_contents = TokenSources
- .getTokenStream(tpv_contents);
- high_contents = highlighter.getBestFragments(
- token_contents, _contents,
- maxNumFragmentsRequired, fragmentSeparator);
- }
- mBean.setFilename((high_filename != null && !high_filename
- .equals("")) ? high_filename : _filename);
- mBean.setContents((high_contents != null && !high_contents
- .equals("")) ? high_contents
- : (_contents.length() > content_length ? _contents
- .substring(0, content_length) : _contents));
- mList.add(mBean);
- }
- searcher.close();
- reader.close();
- } catch (ParseException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return mList;
- }
- public Integer searchIndexLength(String keyword, boolean highlight,
- int content_length, int start, int length, int maxLength) {
- int _count = 0;
- String indexpath = "E:\\lucene\\index";
- if (indexpath != null && new File(indexpath).exists()
- && keyword != null && !keyword.trim().equals("") && length > 0) {
- start = (start > 0) ? start : 1;
- String[] FIELD = { "filename", "contents" };
- Analyzer analyzer = new PaodingAnalyzer();
- FSDirectory directory;
- IndexReader reader;
- Searcher searcher;
- try {
- directory = FSDirectory.getDirectory(indexpath);
- reader = IndexReader.open(directory);
- String queryString = keyword;
- BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
- BooleanClause.Occur.SHOULD,
- BooleanClause.Occur.SHOULD };
- Query query = MultiFieldQueryParser.parse(queryString, FIELD,
- flags, analyzer);
- searcher = new IndexSearcher(reader);
- query = query.rewrite(reader);
- TopDocCollector collector = new TopDocCollector(maxLength);
- searcher.search(query, collector);
- ScoreDoc[] hits = collector.topDocs().scoreDocs;
- _count = hits.length;
- searcher.close();
- reader.close();
- } catch (ParseException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return _count;
- }
- }
public class MSearcher { public List<MBean> searchIndex(String keyword, boolean highlight, int content_length, int start, int length) { String indexpath = "E:\\lucene\\index"; // 索引所在目录 List<MBean> mList = new ArrayList<MBean>(); if (indexpath != null && new File(indexpath).exists() && keyword != null && !keyword.trim().equals("") && length > 0) { start = (start > 0) ? start : 1; String[] FIELD = { "filename", "contents" }; // 获取Paoding中文分词器 Analyzer analyzer = new PaodingAnalyzer(); FSDirectory directory; IndexReader reader; Searcher searcher; try { directory = FSDirectory.getDirectory(indexpath); reader = IndexReader.open(directory); String queryString = keyword; /* * 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 SHOULD表示查询条件为or * MUST表示查询条件为and MUST_NOT表示查询条件为not */ BooleanClause.Occur[] flags = new BooleanClause.Occur[] { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD }; Query query = MultiFieldQueryParser.parse(queryString, FIELD, flags, analyzer); searcher = new IndexSearcher(directory); query = query.rewrite(reader); //分页,取出前start + length - 1条数据 TopDocCollector collector = new TopDocCollector(start + length - 1); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; BoldFormatter formatter = new BoldFormatter(); Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter( content_length)); for (int i = start - 1; i < hits.length; i++) { MBean mBean = new MBean(); Document doc = searcher.doc(hits[i].doc); String _filename = doc.get(FIELD[0]); String _contents = doc.get(FIELD[1]); int maxNumFragmentsRequired = 5; String fragmentSeparator = "..."; TermPositionVector tpv_filename = (TermPositionVector) reader .getTermFreqVector(hits[i].doc, FIELD[0]); TermPositionVector tpv_contents = (TermPositionVector) reader .getTermFreqVector(hits[i].doc, FIELD[1]); String high_filename = ""; String high_contents = ""; if (tpv_filename != null) { TokenStream token_filename = 
TokenSources .getTokenStream(tpv_filename); high_filename = highlighter.getBestFragments( token_filename, _filename, maxNumFragmentsRequired, fragmentSeparator); } if (tpv_contents != null) { TokenStream token_contents = TokenSources .getTokenStream(tpv_contents); high_contents = highlighter.getBestFragments( token_contents, _contents, maxNumFragmentsRequired, fragmentSeparator); } mBean.setFilename((high_filename != null && !high_filename .equals("")) ? high_filename : _filename); mBean.setContents((high_contents != null && !high_contents .equals("")) ? high_contents : (_contents.length() > content_length ? _contents .substring(0, content_length) : _contents)); mList.add(mBean); } searcher.close(); reader.close(); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } return mList; } public Integer searchIndexLength(String keyword, boolean highlight, int content_length, int start, int length, int maxLength) { int _count = 0; String indexpath = "E:\\lucene\\index"; if (indexpath != null && new File(indexpath).exists() && keyword != null && !keyword.trim().equals("") && length > 0) { start = (start > 0) ? 
start : 1; String[] FIELD = { "filename", "contents" }; Analyzer analyzer = new PaodingAnalyzer(); FSDirectory directory; IndexReader reader; Searcher searcher; try { directory = FSDirectory.getDirectory(indexpath); reader = IndexReader.open(directory); String queryString = keyword; BooleanClause.Occur[] flags = new BooleanClause.Occur[] { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD }; Query query = MultiFieldQueryParser.parse(queryString, FIELD, flags, analyzer); searcher = new IndexSearcher(reader); query = query.rewrite(reader); TopDocCollector collector = new TopDocCollector(maxLength); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; _count = hits.length; searcher.close(); reader.close(); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } return _count; } }
Search.java:处理用户请求的Servlet
- public class Search extends HttpServlet {
- private static final Integer NUMBER = 10;//每页显示10条
- private static final Integer CONTENT_LENGTH = 50;
- private static final Boolean HIGHLIGHT = true;
- private static final long serialVersionUID = 1L;
- private MSearcher mSearcher = new MSearcher();
- @Override
- public void doPost(HttpServletRequest request, HttpServletResponse response)
- throws ServletException, IOException {
- request.setCharacterEncoding("UTF-8");
- String q = request.getParameter("q") != null ? request
- .getParameter("q").trim() : request.getParameter("q");
- System.out.println("----"+q);
- List<MBean> mList = new ArrayList<MBean>();
- List<PBean> pList = new ArrayList<PBean>();
- int start = request.getParameter("start")!= null ? Integer
- .valueOf(request.getParameter("start"))
- : 0;
- int all_count = 0;
- all_count = mSearcher.searchIndexLength( q, HIGHLIGHT,
- CONTENT_LENGTH, start, NUMBER, NUMBER * 1000);
- mList = mSearcher.searchIndex( q, HIGHLIGHT,
- CONTENT_LENGTH, start, NUMBER);
- pList = getPageList(all_count, start);
- if (start > NUMBER) {
- request.setAttribute("previous", start - NUMBER);
- }
- if (start < all_count - NUMBER) {
- request.setAttribute("next", NUMBER + (start != 0 ? start : 1));
- }
- request.setAttribute("q", q);
- request.setAttribute("start", start);
- request.setAttribute("pList", pList);
- request.setAttribute("mList", mList.isEmpty() ? null : mList);
- request.getRequestDispatcher("/index.jsp").forward(request, response);
- }
- @Override
- public void doGet(HttpServletRequest request, HttpServletResponse response)
- throws ServletException, IOException {
- doPost(request, response);
- }
- private static List<PBean> getPageList(int all_count, int start) {
- MIndexer mIndexer = new MIndexer();
- mIndexer.createIndex();
- List<PBean> pList = new ArrayList<PBean>();
- int all_page = (all_count <= 0) ? 1 : (all_count / NUMBER + (all_count
- % NUMBER > 0 ? 1 : 0));
- int now_page = (start <= 0) ? 1
- : (start / NUMBER + (start % NUMBER > 0 ? 1 : 0));
- for (int i = (now_page - 10 > 0 ? now_page - 10 : 1); i <= (((now_page + 9) <= all_page) ? (now_page + 9)
- : all_page); i++) {
- PBean pBean = new PBean();
- pBean.setPage(i);
- pBean.setStart((pBean.getPage() - 1) * NUMBER + 1);
- pList.add(pBean);
- }
- return pList;
- }
- }
public class Search extends HttpServlet { private static final Integer NUMBER = 10;//每页显示10条 private static final Integer CONTENT_LENGTH = 50; private static final Boolean HIGHLIGHT = true; private static final long serialVersionUID = 1L; private MSearcher mSearcher = new MSearcher(); @Override public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { request.setCharacterEncoding("UTF-8"); String q = request.getParameter("q") != null ? request .getParameter("q").trim() : request.getParameter("q"); System.out.println("----"+q); List<MBean> mList = new ArrayList<MBean>(); List<PBean> pList = new ArrayList<PBean>(); int start = request.getParameter("start")!= null ? Integer .valueOf(request.getParameter("start")) : 0; int all_count = 0; all_count = mSearcher.searchIndexLength( q, HIGHLIGHT, CONTENT_LENGTH, start, NUMBER, NUMBER * 1000); mList = mSearcher.searchIndex( q, HIGHLIGHT, CONTENT_LENGTH, start, NUMBER); pList = getPageList(all_count, start); if (start > NUMBER) { request.setAttribute("previous", start - NUMBER); } if (start < all_count - NUMBER) { request.setAttribute("next", NUMBER + (start != 0 ? start : 1)); } request.setAttribute("q", q); request.setAttribute("start", start); request.setAttribute("pList", pList); request.setAttribute("mList", mList.isEmpty() ? null : mList); request.getRequestDispatcher("/index.jsp").forward(request, response); } @Override public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { doPost(request, response); } private static List<PBean> getPageList(int all_count, int start) { MIndexer mIndexer = new MIndexer(); mIndexer.createIndex(); List<PBean> pList = new ArrayList<PBean>(); int all_page = (all_count <= 0) ? 1 : (all_count / NUMBER + (all_count % NUMBER > 0 ? 1 : 0)); int now_page = (start <= 0) ? 1 : (start / NUMBER + (start % NUMBER > 0 ? 1 : 0)); for (int i = (now_page - 10 > 0 ? 
now_page - 10 : 1); i <= (((now_page + 9) <= all_page) ? (now_page + 9) : all_page); i++) { PBean pBean = new PBean(); pBean.setPage(i); pBean.setStart((pBean.getPage() - 1) * NUMBER + 1); pList.add(pBean); } return pList; } }