Lucene Directory类 和 Term查询
一 、 Directory介绍
1、使用FSDirectory.open(java.io.File)方法,会根据当前的运行环境打开一个最合理的基于File的Directory。
SimpleFSDirectory : 使用RandomAccessFile类访问文件,但是在并发上面效率不是很高
NIOFSDirectory : 使用java.nio.FileChannel,能够提高并发访问效率
MMapDirectory : 使用内存映射文件(mmap)方式读取索引,在64bit的JVM上效果较好;根据操作系统以及64bit或者32bit环境自动选择合适directory实现的是上面的FSDirectory.open方法
2、new RAMDirectory会从内存中打开directory,好处是速度快,缺点是无法持久化
二、Term查询
1、编写各种查询类
package com.hb.lucence;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.QueryParser.Operator;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Demo helper that builds a small in-memory Lucene index and runs the basic
 * term-level queries (term, term range, numeric range, prefix, wildcard,
 * fuzzy, boolean) against it, printing every hit to standard output.
 *
 * <p>{@link #index()} only adds the fields {@code id}, {@code email},
 * {@code content} and {@code name}. The result line also prints
 * {@code attach} and {@code date}; those are not indexed by this class, so
 * they print as {@code null}.
 */
public class SearchUtil {
    private final Directory directory;
    /** Shared reader; reopened by {@link #getSearcher()} when the index changed. */
    private IndexReader reader;
    private final String[] ids = { "1", "2", "3", "4", "5", "6" };
    // NOTE(review): the published source showed these six values as the
    // obfuscation placeholder "[email protected]" (no '@' at all), which makes
    // the domain-based boost in index() and the email prefix/wildcard examples
    // meaningless. The addresses below are reconstructed to match the boost
    // table ("itat.org", "zttc.edu") and the examples ("a*", "b", "*@itat.org")
    // -- confirm against the original data set.
    private final String[] emails = { "aa@itat.org", "bb@itat.org", "cc@cc.org", "dd@sina.org", "ee@zttc.edu", "ff@itat.org" };
    private final String[] contents = { "welcome to visited the space,I like book", "hello boy, I like pingpeng ball", "my name is cc I like game", "I like football", "I like football and I like basketball too", "I like movie and swim" };
    private final String[] names = { "zhangsan", "lisi", "john", "jetty", "mike", "jake" };
    /** Document boost keyed by the e-mail domain (the text after the last '@'). */
    private final Map<String, Float> scores = new HashMap<String, Float>();

    /**
     * Creates the boost table and an in-memory directory, then builds the index.
     */
    public SearchUtil() {
        scores.put("itat.org", 2.0f);
        scores.put("zttc.edu", 1.5f);
        directory = new RAMDirectory();
        this.index();
    }

    /**
     * Rebuilds the index from scratch: deletes all documents, then adds one
     * document per entry of the sample arrays. I/O failures are printed, not
     * rethrown.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
                // A second value for the same field name: "email" is multi-valued.
                doc.add(new Field("email", "test" + i + "@test.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                System.out.println(et);
                // Index-time boost: known domains get their configured weight,
                // everything else is demoted to 0.5.
                Float boost = scores.get(et);
                doc.setBoost(boost != null ? boost.floatValue() : 0.5f);
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            // Also covers CorruptIndexException and LockObtainFailedException,
            // which the original handled identically in separate catch clauses.
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns a searcher over the shared reader, opening the reader on first
     * use and reopening it when the index has changed since it was opened.
     *
     * @return a new searcher, or {@code null} if the reader could not be
     *         opened (the exception is printed)
     */
    public IndexSearcher getSearcher() {
        try {
            if (reader == null) {
                reader = IndexReader.open(directory);
            } else {
                IndexReader changed = IndexReader.openIfChanged(reader);
                if (changed != null) {
                    // Release the stale reader before switching to the fresh
                    // one; the original called clone() here, which leaked it.
                    reader.close();
                    reader = changed;
                }
            }
            return new IndexSearcher(reader);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Exact-term query.
     *
     * @param field field to search
     * @param name  term text that must match exactly
     * @param num   maximum number of hits to fetch
     */
    public void searchByTerm(String field, String name, int num) {
        runAndPrint(new TermQuery(new Term(field, name)), num, "一共查询了 :");
    }

    /**
     * Lexicographic term-range query with both bounds included.
     *
     * @param field field to search
     * @param start lower term (inclusive)
     * @param end   upper term (inclusive)
     * @param num   maximum number of hits to fetch
     */
    public void searchByTermRange(String field, String start, String end, int num) {
        runAndPrint(new TermRangeQuery(field, start, end, true, true), num, "一共查询了:");
    }

    /**
     * Integer range query with both bounds included.
     *
     * @param field numeric field to search
     * @param start lower bound (inclusive)
     * @param end   upper bound (inclusive)
     * @param num   maximum number of hits to fetch
     */
    public void searchByNumricRange(String field, int start, int end, int num) {
        // The original ignored num (hard-coded 10) and swallowed IOException.
        runAndPrint(NumericRangeQuery.newIntRange(field, start, end, true, true), num, null);
    }

    /**
     * Prefix query.
     *
     * @param field field to search
     * @param value prefix the term must start with
     * @param num   maximum number of hits to fetch
     */
    public void searchByPrefix(String field, String value, int num) {
        runAndPrint(new PrefixQuery(new Term(field, value)), num, null);
    }

    /**
     * Wildcard query; in {@code value}, {@code ?} matches exactly one
     * character and {@code *} matches any number of characters.
     *
     * @param field field to search
     * @param value wildcard pattern
     * @param num   maximum number of hits to fetch
     */
    public void searchByWildcard(String field, String value, int num) {
        runAndPrint(new WildcardQuery(new Term(field, value)), num, "一共查询了 : ");
    }

    /**
     * Fuzzy query using {@link FuzzyQuery}'s default similarity settings.
     *
     * @param field field to search
     * @param value term text to match approximately
     * @param num   maximum number of hits to fetch
     */
    public void searchByFuzzy(String field, String value, int num) {
        runAndPrint(new FuzzyQuery(new Term(field, value)), num, "一共查询了 : ");
    }

    /**
     * Boolean query demo: excludes documents whose name is "zhangsan" and
     * optionally matches documents whose content contains "game".
     *
     * @param num maximum number of hits to fetch
     */
    public void searchByBoolean(int num) {
        // BooleanQuery combines sub-queries: Occur.MUST = must match,
        // Occur.SHOULD = may match, Occur.MUST_NOT = must not match.
        BooleanQuery query = new BooleanQuery();
        query.add(new TermQuery(new Term("name", "zhangsan")), Occur.MUST_NOT);
        query.add(new TermQuery(new Term("content", "game")), Occur.SHOULD);
        runAndPrint(query, num, "一共查询了:");
    }

    /**
     * Runs {@code query}, optionally prints the total hit count, prints one
     * line per returned hit and always closes the searcher.
     *
     * @param query          query to execute
     * @param num            maximum number of hits to fetch
     * @param totalHitsLabel text printed in front of the total hit count, or
     *                       {@code null} to skip that line
     */
    private void runAndPrint(Query query, int num, String totalHitsLabel) {
        IndexSearcher searcher = this.getSearcher();
        if (searcher == null) {
            // getSearcher() has already printed the failure.
            return;
        }
        try {
            TopDocs tds = searcher.search(query, num);
            if (totalHitsLabel != null) {
                System.out.println(totalHitsLabel + tds.totalHits);
            }
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                System.out.println(doc.get("id") + "---->" + doc.get("name") + "[" + doc.get("email") + "]-->" + doc.get("id") + "," + doc.get("attach") + "," + doc.get("date"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the per-call searcher even when the search failed.
            // NOTE(review): per the Lucene 3.5 API docs this should leave the
            // shared reader open, since the searcher was built from it -- confirm.
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
NumericRange :
数字范围查询 。
//是否包含开始,是否包含结束
Query query = NumericRangeQuery.newIntRange (field, start, end, true, true);
su.searchByNumricRange("attach", 2, 10, 5);
PrefixQuery :
通过字符串前缀来查询。
Query query = new PrefixQuery(new Term(field, value));
//查询以sex开头的
su.searchByPrefix("content", "sex", 10);
WildcardQuery :
通配符查询。
//在传入的value中可以使用通配符:?和*,?表示匹配一个字符,*表示匹配任意多个字符
Query query = new WildcardQuery(new Term(field, value));
//匹配@itat.org结尾的所有字符
su.searchByWildcard("email", "*@itat.org", 10);
//匹配以j开头、后面跟三个任意字符的name(共四个字符)
su.searchByWildcard("name", "j???", 10);
BooleanQuery :
用于表示布尔查询子句关系的类,包括:BooleanClause.Occur.MUST,BooleanClause.Occur.MUST_NOT,BooleanClause.Occur.SHOULD。
必须包含,不能包含,可以包含三种.有以下6种组合 :
1.MUST和MUST:取得两个查询子句的交集。
2.MUST和MUST_NOT:表示查询结果中不能包含MUST_NOT所对应的查询子句的检索结果。
3.SHOULD与MUST_NOT:连用时,功能同MUST和MUST_NOT。
4.SHOULD与MUST连用时,结果为MUST子句的检索结果,但是SHOULD可影响排序。
5.SHOULD与SHOULD:表示“或”关系,最终检索结果为所有检索子句的并集。
6.MUST_NOT和MUST_NOT:无意义,检索无结果。
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("name", "zhangsan")), Occur.MUST);
query.add(new TermQuery(new Term("content", "welcome")), Occur.MUST_NOT);
PhraseQuery :
短语查询。
query.add(new Term("content", "pingpeng"));
query.add(new Term("content", "i"));
query.setSlop(3);//要求结果中不仅包含上面的term,并且两个Term之间的间隔不能超过3
FuzzyQuery :
模糊查询,在FuzzyQuery类定义中定义了两个成员变量:
private float minimumSimilarity;
private int prefixLength;
minimumSimilarity是最小相似度,取值范围为0.0~1.0,包含0.0但不包含1.0,默认值为0.5。prefixLength是前缀长度,默认为0。
minimumSimilarity表示是最小相似度,可以通过指定一个相似度来决定模糊匹配的严格程度。默认为0.5, 当这个值越小,通过模糊查找出的文档的匹配程度就越低,
文档的数量也就越多;当这个值越大,说明要匹配程度更大,匹配的文档数也就越少,当相似度设置为1,那么就退化为TermQuery查询,所以当这个值>=1或<0会抛出IllegalArgumentException异常。
另外一个参数prefixLength表示在进行模糊匹配的时候,要有多少个前缀字母必须完全匹配。 例如当该值设置为“1”,则表示所有词条只有第一个字母与检索关键字相符时,才会被结果集选中。
FuzzyQuery query = new FuzzyQuery(new Term("name", "zhang"), 0.1f, 0);
2、使用junit测试各个方法
import java.io.File;
import java.util.Collection;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFileFilter;
import org.apache.lucene.search.NumericRangeQuery;
import org.junit.Test;
import com.hb.lucence.SearchUtil;
/**
 * Smoke tests for {@link SearchUtil}: each test builds a fresh in-memory
 * index and runs one query type, whose hits are printed to the console.
 */
public class SearchUtilTest {

    /** Looks up the document whose id is exactly "1". */
    @Test
    public void searchByTermTest() {
        new SearchUtil().searchByTerm("id", "1", 10);
    }

    /**
     * Term-range lookup on ids "2" through "4".
     *
     * <p>The same call also works on names, e.g. the range "a".."s" on
     * "name". A term range cannot be used on the numeric "attach" field.
     */
    @Test
    public void searchTermRangeTest() {
        new SearchUtil().searchByTermRange("id", "2", "4", 10);
    }

    /** Numeric range lookup on "attach" between 2 and 4. */
    @Test
    public void numericRangeQueryTest() {
        new SearchUtil().searchByNumricRange("attach", 2, 4, 10);
    }

    /** Finds e-mail values starting with "b". */
    @Test
    public void prefixTest() {
        new SearchUtil().searchByPrefix("email", "b", 10);
    }

    /** Finds e-mail values starting with "a" via a wildcard pattern. */
    @Test
    public void wildcardTest() {
        new SearchUtil().searchByWildcard("email", "a*", 10);
    }

    /** Finds names that differ from "jaee" by one character, e.g. "jake". */
    @Test
    public void fuzzyTest() {
        new SearchUtil().searchByFuzzy("name", "jaee", 10);
    }
}
已有 0 人发表留言,猛击->> 这里<<-参与讨论
ITeye推荐