lucene实现自定义的评分 - 学习笔记 - 博客频道 - CSDN.NET
Lucene按一个或多个字段进行排序是基本功能,但有时需要更高级的自定义排序,即通过调整文档得分来影响排序结果。Lucene自定义排序、调整打分的方法有下面几种:
1、在索引阶段设置Document Boost和Field Boost,提升文档或字段的排名,例如:
// Example A: document-level boost. The factor is set on the whole document,
// so it applies to hits on any of its fields.
Document doc1 = new Document();
Field f1 = new Field("contents", "common hello hello", Field.Store.NO, Field.Index.ANALYZED);
doc1.add(f1);
doc1.setBoost(100f);
writer.addDocument(doc1);
// Example B: field-level boost. The factor is set on the "title" field only.
// Distinct variable names let both examples live in the same scope.
Document doc2 = new Document();
Field f2 = new Field("title", "common hello hello", Field.Store.NO, Field.Index.ANALYZED);
f2.setBoost(100f);
doc2.add(f2);
writer.addDocument(doc2);
2、通过继承并实现自己的Similarity,覆盖方法float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length)
/**
 * Similarity that derives the payload score from a "bold" marker stored in the payload.
 */
class PayloadSimilarity extends DefaultSimilarity {

    /**
     * Scores one payload: 10 when the decoded payload equals {@code BoldFilter.IS_BOLD},
     * otherwise 1. A diagnostic line is printed to standard output on every call.
     * Only {@code payload} is inspected; the remaining arguments are ignored.
     */
    @Override
    public float scorePayload(int docId, String fieldName, int start, int end,
            byte[] payload, int offset, int length) {
        final int decoded = BoldFilter.bytes2int(payload);
        final boolean bold = (decoded == BoldFilter.IS_BOLD);
        System.out.println(bold ? "It is a bold char." : "It is not a bold char.");
        return bold ? 10f : 1f;
    }
}
3、继承并实现自定义CustomScoreProvider和CustomScoreQuery,对评分进行干预,影响排名排序,例如:
- package util;
- import java.io.IOException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.function.CustomScoreProvider;
- import org.apache.lucene.search.function.CustomScoreQuery;
- import org.apache.lucene.search.function.FieldScoreQuery;
- import org.apache.lucene.search.function.ValueSourceQuery;
- import org.apache.lucene.search.function.FieldScoreQuery.Type;
- /**
- * Example of a CustomScoreQuery that combines the relevance score of a term query
- * with the numeric value of the "size" field.
- */
- public class MyScoreQuery1 {
- /**
- * Searches the "content" field for "java", rescores the hits with
- * {@link MyCustomScoreQuery} and prints the top 100 results.
- *
- * @throws Exception if obtaining the searcher, searching or printing fails
- */
- public void searchByScoreQuery() throws Exception {
- IndexSearcher searcher = DocUtil.getSearcher();
- try {
- Query query = new TermQuery(new Term("content", "java"));
- // 1. Build the value-source query over the scoring field; Type selects how the
- // indexed value is parsed (here as an int).
- // The field must hold a numeric value with a single token per document and is
- // best indexed without norms, e.g. with Field.Index.NOT_ANALYZED_NO_NORMS.
- FieldScoreQuery fieldScoreQuery = new FieldScoreQuery("size", Type.INT);
- // 2. Combine the original query with the scoring query:
- // query supplies the matches, fieldScoreQuery supplies the per-document value.
- MyCustomScoreQuery customQuery = new MyCustomScoreQuery(query, fieldScoreQuery);
- TopDocs topdoc = searcher.search(customQuery, 100);
- DocUtil.printDocument(topdoc, searcher);
- } finally {
- // Release the searcher even when searching or printing throws.
- searcher.close();
- }
- }
- /**
- * Custom query whose scores are produced by {@link MyCustomScoreProvider}.
- * Declared static so that the query does not keep a hidden reference to the
- * enclosing MyScoreQuery1 instance.
- */
- @SuppressWarnings("serial")
- private static class MyCustomScoreQuery extends CustomScoreQuery {
- public MyCustomScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery) {
- super(subQuery, valSrcQuery);
- }
- /**
- * The reader passed in is a per-segment reader: when the index has more than one
- * segment this method is called several times during a single search. This matters
- * because it lets the scoring logic use the segment reader to look values up in
- * the field cache efficiently.
- *
- * The default provider returns subQueryScore * valSrcScore; to score differently,
- * subclass CustomScoreProvider, override customScore and return that provider here.
- */
- @Override
- protected CustomScoreProvider getCustomScoreProvider(IndexReader reader)
- throws IOException {
- return new MyCustomScoreProvider(reader);
- }
- }
- /**
- * Score provider that divides the sub-query score by the value of the scoring field.
- */
- private static class MyCustomScoreProvider extends CustomScoreProvider {
- public MyCustomScoreProvider(IndexReader reader) {
- super(reader);
- }
- /**
- * Computes the final score of one document and prints the inputs for inspection.
- *
- * @param doc id of the document, relative to the current reader
- * @param subQueryScore score the original query gave to the document
- * @param valSrcScore value taken from the scoring field
- * @return subQueryScore / valSrcScore, or subQueryScore unchanged when
- * valSrcScore is zero
- */
- @Override
- public float customScore(int doc, float subQueryScore, float valSrcScore) throws IOException {
- System.out.println("Doc:" + doc);
- System.out.println("subQueryScore:" + subQueryScore);
- System.out.println("valSrcScore:" + valSrcScore);
- if (valSrcScore == 0f) {
- // Dividing by zero would produce Infinity or NaN, which are not usable as
- // scores; fall back to the unmodified sub-query score.
- return subQueryScore;
- }
- return subQueryScore / valSrcScore;
- }
- }
- }
根据特定的几个文件名来评分,选中的文件名权重变大
- package util;
- import java.io.IOException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.FieldCache;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.function.CustomScoreProvider;
- import org.apache.lucene.search.function.CustomScoreQuery;
- /**
- * Boosts the score of documents with specific file names.
- * The same technique could, for example, boost books published in the last
- * year or two when searching a book catalogue.
- * @author user
- */
- public class MyScoreQuery2 {
- /** Factor by which selected files are boosted and all other files are demoted. */
- private static final float BOOST_FACTOR = 1.5f;
- /**
- * Searches the "content" field for "java", rescores the hits with
- * {@link FilenameScoreQuery} and prints the top 100 results.
- *
- * @throws Exception if obtaining the searcher, searching or printing fails
- */
- public void searchByFileScoreQuery() throws Exception {
- IndexSearcher searcher = DocUtil.getSearcher();
- try {
- Query query = new TermQuery(new Term("content", "java"));
- FilenameScoreQuery fieldScoreQuery = new FilenameScoreQuery(query);
- TopDocs topdoc = searcher.search(fieldScoreQuery, 100);
- DocUtil.printDocument(topdoc, searcher);
- } finally {
- // Release the searcher even when searching or printing throws.
- searcher.close();
- }
- }
- /**
- * Custom query whose scores are produced by {@link FilenameScoreProvider}.
- * Declared static so that the query does not keep a hidden reference to the
- * enclosing MyScoreQuery2 instance.
- */
- @SuppressWarnings("serial")
- private static class FilenameScoreQuery extends CustomScoreQuery {
- public FilenameScoreQuery(Query subQuery) {
- super(subQuery);
- }
- @Override
- protected CustomScoreProvider getCustomScoreProvider(IndexReader reader)
- throws IOException {
- return new FilenameScoreProvider(reader);
- }
- }
- /**
- * Score provider that adjusts the sub-query score according to the document's
- * "filename" field.
- */
- private static class FilenameScoreProvider extends CustomScoreProvider {
- // How to get a field value for a given doc id: until the reader is closed the
- // values are held in the field cache, and
- // FieldCache.DEFAULT.getStrings(reader, "filename") returns the "filename" value
- // of every document of that reader, indexed by doc id.
- private final String[] filenames;
- /**
- * @param reader reader whose "filename" values are loaded from the field cache
- * @throws IOException if the field cache cannot be populated; propagated instead
- * of being swallowed, so that scoring never runs against a null array
- */
- public FilenameScoreProvider(IndexReader reader) throws IOException {
- super(reader);
- filenames = FieldCache.DEFAULT.getStrings(reader, "filename");
- }
- /**
- * Multiplies the score by BOOST_FACTOR for "9.txt" and "4.txt" and divides it by
- * BOOST_FACTOR for every other document; also prints the doc id and file name.
- *
- * @param doc id of the document, relative to the current reader
- * @param subQueryScore score the original query gave to the document
- * @param valSrcScore value-source score; not used by this provider
- * @return the adjusted score
- */
- @Override
- public float customScore(int doc, float subQueryScore, float valSrcScore)
- throws IOException {
- String fileName = filenames[doc];
- System.out.println(doc + ":" + fileName);
- if ("9.txt".equals(fileName) || "4.txt".equals(fileName)) {
- return subQueryScore * BOOST_FACTOR;
- }
- return subQueryScore / BOOST_FACTOR;
- }
- }
- }