所谓分组统计,就是类似sql里group by的功能。在solr里,这个功能称为faceting。lucene本身不支持分组统计,不过可以使用fieldCache来实现分组统计功能,而且也有很好的性能。solr根据不同的情况,还提供了其他方法(filterCache和UnInvertedField)来实现,这个以后再说。
fieldCache是lucene用来排序的缓存。对要用来排序的字段,lucene会从索引中将每篇文档该字段的值都读出来,放到一个大小为maxDoc的数组中。maxDoc是lucene内部文档编号的最大值。有两点需要注意一下:
- fieldCache中的字段值是从倒排表中读出来的,而不是索引文件中存储的字段值,所以用来排序的字段必须设为索引字段(indexed)
- 用来排序的字段在索引的时候不能拆分(tokenized),因为fieldCache数组中,每个文档只对应一个字段值,拆分的话,cache中只会保存在词典中靠后的值。
fieldCache是lucene中最占用内存的部分,大部分内存溢出的错误都是由它引起的,需要特别注意。
分组统计可以借用fieldCache来高效率地实现。调用lucene进行查询,通过读取倒排表并进行boolean运算,得到一个满足条件的文档的集合。通过每个结果文档号读取fieldCache数组中的值,并分不同的值累加数目,即可实现分组统计的功能。其中,如果某个字段对应多值,则在索引的时候不拆分,从fieldCache数组读出后,再进行拆分统计。
好了,说了半天,现在来看看实现代码:Test.java
import java.io.IOException;
import java.util.List;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.HBxx2Similarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
/**
 * Demo driver for group-by (facet) counting on top of Lucene's FieldCache.
 *
 * <p>{@link #WriteIndex()} builds a test index under {@link #path};
 * {@link #search(int, int)} runs a match-all query, prints the number of hits
 * per distinct "fenlei" value and then one page of results.
 */
public class Test {
    /** Location of the index directory on disk. */
    String path = "D:\\index";
    /** Lucene compatibility version handed to the analyzer. */
    Version version = Version.LUCENE_29;

    /**
     * Runs a match-all search, prints the per-"fenlei" group counts, the total
     * hit count and the requested page of hits, followed by the elapsed time.
     *
     * <p>An {@link IOException} raised while searching is printed to stderr
     * and not rethrown.
     *
     * @param pageNO   1-based page number
     * @param pageSize number of hits per page
     * @throws ParseException declared for callers; not thrown by this body
     * @throws IllegalArgumentException if {@code pageNO} or {@code pageSize} is less than 1
     */
    @SuppressWarnings("deprecation")
    public void search(int pageNO, int pageSize) throws ParseException {
        if (pageNO < 1 || pageSize < 1) {
            throw new IllegalArgumentException(
                    "pageNO and pageSize must be >= 1, got pageNO=" + pageNO
                            + ", pageSize=" + pageSize);
        }
        try {
            long begin = System.currentTimeMillis();
            int start = (pageNO - 1) * pageSize;
            // The collector has to keep every hit up to the end of the requested page.
            int topCount = pageSize * pageNO;
            IndexReader reader = IndexReader.open(FSDirectory
                    .getDirectory(path));
            try {
                Searcher searcher = new IndexSearcher(reader);
                try {
                    TopDocsCollector collector = TopScoreDocCollector.create(
                            topCount, false);
                    // Load the "fenlei" value of every document through the FieldCache.
                    final String[] fc = FieldCache.DEFAULT.getStrings(reader,
                            "fenlei");
                    // GroupCollector wraps the real collector and counts hits per value.
                    GroupCollector groupCollector = new GroupCollector(
                            collector, fc);
                    searcher.search(new MatchAllDocsQuery(), groupCollector);
                    // GroupField holds the per-value counts gathered during the search.
                    GroupField gf = groupCollector.getGroupField();
                    System.out.println("分组信息");
                    List<String> values = gf.getValues();
                    for (String value : values) {
                        System.out.println(value + "="
                                + gf.getCountMap().get(value));
                    }
                    // Total number of hits.
                    int totalHits = collector.getTotalHits();
                    System.out.println("总数:" + totalHits);
                    System.out.println("分页结果");
                    // Hits belonging to the requested page.
                    ScoreDoc[] scoreDocs = collector.topDocs(start, pageSize).scoreDocs;
                    for (int i = 0; i < scoreDocs.length; i++) {
                        int docId = scoreDocs[i].doc;
                        Document doc = reader.document(docId);
                        System.out.println("id:" + doc.get("id") + " fenlei:"
                                + doc.get("fenlei") + " title:"
                                + doc.get("title"));
                    }
                    long time = System.currentTimeMillis() - begin;
                    System.out.println("搜索所用时间为:" + time + "毫秒");
                } finally {
                    // Release the searcher even when the search or printing fails.
                    searcher.close();
                }
            } finally {
                // The searcher was built from this reader, so the reader is closed here.
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the test index: 1,000,000 documents with an "id", a tokenized
     * "title" and an untokenized "fenlei" field whose value changes every
     * 75,000 documents. Prints the elapsed time when done.
     *
     * <p>If indexing fails, the writer is rolled back so that it is closed
     * instead of being left open.
     *
     * @throws CorruptIndexException     propagated from the index writer
     * @throws LockObtainFailedException propagated from the index writer
     * @throws IOException               propagated from the index writer
     */
    @SuppressWarnings("deprecation")
    public void WriteIndex() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        long begin = System.currentTimeMillis();
        // Analyzer: StandardAnalyzer by default, MMAnalyzer for the "title" field.
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
                new StandardAnalyzer(version));
        analyzer.addAnalyzer("title", new MMAnalyzer());
        IndexWriter writer = new IndexWriter(FSDirectory.getDirectory(path),
                analyzer, MaxFieldLength.LIMITED);
        boolean success = false;
        try {
            writer.setSimilarity(new HBxx2Similarity());
            writer.setMaxBufferedDocs(2048);
            writer.setRAMBufferSizeMB(256);
            int count = 0;
            String title = "中国人民 测试数据";
            String fenlei = "分类";
            // Generate the documents and add them to the index.
            int max = 1000000;
            int groupMax = 75000;
            for (int i = 0; i < max; i++) {
                // Start a new "fenlei" group (and print progress) every groupMax documents.
                if (i % groupMax == 0) {
                    count++;
                    System.out.println(i);
                }
                Document document = new Document();
                Field idField = new Field("id", Integer.toString(i + 1),
                        Store.YES, Index.NOT_ANALYZED);
                Field titleField = new Field("title", title + (i + 1),
                        Store.YES, Index.ANALYZED);
                Field fenleiField = new Field("fenlei", fenlei + count,
                        Store.YES, Index.NOT_ANALYZED);
                document.add(idField);
                document.add(titleField);
                document.add(fenleiField);
                writer.addDocument(document);
            }
            writer.commit();
            writer.optimize();
            success = true;
        } finally {
            if (success) {
                writer.close();
            } else {
                // Discard uncommitted work and close the writer on failure.
                writer.rollback();
            }
        }
        long time = System.currentTimeMillis() - begin;
        System.out.println("创建索引所用时间为:" + time + "毫秒");
    }

    /**
     * Entry point: searches the existing index for page 100 with 20 hits per
     * page. Uncomment the {@code WriteIndex()} call to build the index first.
     */
    public static void main(String[] args) throws CorruptIndexException,
            IOException, ParseException {
        Test test = new Test();
        // Build the index.
        // test.WriteIndex();
        // Search the index.
        int pageNO = 100, pageSize = 20;
        test.search(pageNO, pageSize);
    }
}
GroupField.java
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Holds the result of a group-by count for one field: the distinct values
 * that were added and how many times each one occurred.
 */
public class GroupField {
    /**
     * Field name.
     */
    private String name;
    /**
     * Every distinct value seen so far; {@link #getValues()} orders it by
     * descending occurrence count.
     */
    private List<String> values = new ArrayList<String>();
    /**
     * Maps each distinct value to the number of times it was added.
     */
    private Map<String, Integer> countMap = new HashMap<String, Integer>();

    public Map<String, Integer> getCountMap() {
        return countMap;
    }

    public void setCountMap(Map<String, Integer> countMap) {
        this.countMap = countMap;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Returns the distinct values sorted by descending count. The internal
     * list is sorted in place and returned directly (no copy is made).
     *
     * @return the distinct values, most frequent first
     */
    public List<String> getValues() {
        Collections.sort(values, new ValueComparator());
        return values;
    }

    public void setValues(List<String> values) {
        this.values = values;
    }

    /**
     * Records one occurrence of each space-separated token in {@code value}.
     * A {@code null} or empty value is ignored, and so are the empty tokens
     * produced by leading or consecutive spaces.
     *
     * @param value a single value, or several values separated by spaces
     */
    public void addValue(String value) {
        if (value == null || "".equals(value)) {
            return;
        }
        // Multi-valued fields are stored as one space-separated string.
        String[] tokens = value.split(" ");
        for (String token : tokens) {
            // "a  b" or " a" yields empty tokens; they are not real group values.
            if (token.length() == 0) {
                continue;
            }
            Integer current = countMap.get(token);
            if (current == null) {
                countMap.put(token, 1);
                values.add(token);
            } else {
                countMap.put(token, current + 1);
            }
        }
    }

    /**
     * Returns the recorded count for {@code value}, or 0 when none is recorded.
     */
    private int countOf(String value) {
        Integer count = countMap.get(value);
        return count == null ? 0 : count.intValue();
    }

    /**
     * Orders values by descending occurrence count; values with equal counts
     * compare as equal.
     */
    class ValueComparator implements Comparator<String> {
        public int compare(String value0, String value1) {
            int count0 = countOf(value0);
            int count1 = countOf(value1);
            if (count0 > count1) {
                return -1;
            }
            if (count0 < count1) {
                return 1;
            }
            return 0;
        }
    }
}
GroupCollector.java
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocsCollector;
/**
 * Collector that forwards every callback to a wrapped collector and, for each
 * collected document, also adds that document's cached field value to a
 * {@link GroupField} so that hits are counted per value.
 *
 * <p>{@code null} is passed to the superclass constructor, so results are
 * presumably meant to be read from the wrapped collector rather than from
 * this object (that is how {@code Test.search} uses it).
 */
public class GroupCollector extends TopDocsCollector {
    /** The wrapped collector that receives every delegated call. */
    Collector collector;
    /** Doc-id offset of the reader currently being collected. */
    int docBase;
    /** Field values indexed by index-wide document id. */
    private String[] fieldValues;
    /** Accumulates the per-value counts. */
    private GroupField groupField = new GroupField();

    /**
     * @param topDocsCollector collector to delegate to
     * @param fieldCache       field values indexed by index-wide document id
     */
    GroupCollector(Collector topDocsCollector, String[] fieldCache)
            throws IOException {
        super(null);
        this.collector = topDocsCollector;
        this.fieldValues = fieldCache;
    }

    /**
     * Delegates the hit, then counts the field value of the document.
     *
     * @param doc document id relative to the current reader
     */
    @Override
    public void collect(int doc) throws IOException {
        this.collector.collect(doc);
        // doc is relative to the current segment; adding docBase gives the
        // index-wide id used to address the field value array.
        this.groupField.addValue(this.fieldValues[this.docBase + doc]);
    }

    /**
     * Delegates to the wrapped collector and remembers the new doc-id offset.
     */
    @Override
    public void setNextReader(IndexReader reader, int docBase)
            throws IOException {
        this.collector.setNextReader(reader, docBase);
        this.docBase = docBase;
    }

    /** Delegates to the wrapped collector. */
    @Override
    public void setScorer(Scorer scorer) throws IOException {
        this.collector.setScorer(scorer);
    }

    /** Delegates to the wrapped collector. */
    @Override
    public boolean acceptsDocsOutOfOrder() {
        return this.collector.acceptsDocsOutOfOrder();
    }

    /** Replaces the field value array used for counting. */
    public void setFc(String[] fc) {
        this.fieldValues = fc;
    }

    /** Returns the accumulated per-value counts. */
    public GroupField getGroupField() {
        return this.groupField;
    }
}
阅读全文……