Lucene过滤器 - baobeituping - ITeye技术网站
有的应用有些要求,对于某类型的内容即使满足条件了,但是也不能被搜索出来,lucene中提供了过滤器的功能,通过自定义的过滤器继承Filter,从而实现特定的过滤功能。
Filter是一种过滤行为,BitSet是一种位集合,集合中的每一位只有两种取值:TRUE或FALSE。Lucene用这两种取值表示文档是否被过滤,也就是说,Lucene返回结果时,会首先遍历BitSet,仅将那些对应值为TRUE的文档返回。
过滤器:
package com.filter;
import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.Filter;
/**
 * Search filter that hides every document whose {@code securitylevel} field
 * equals {@link #ADVANCED}; all other documents stay visible.
 */
public class AdvancedSecurityFilter extends Filter {
    /** Security-level value of the documents that must be filtered out. */
    public static final int ADVANCED = 0;

    /**
     * Builds the visibility bit set for the given reader: bit {@code n} is
     * {@code true} when document {@code n} may be returned by a search.
     *
     * @param reader index reader whose documents are being filtered
     * @return bit set with one bit per document id in {@code [0, reader.maxDoc())}
     * @throws IOException if reading the term postings fails
     */
    @Override
    public BitSet bits(IndexReader reader) throws IOException {
        final int maxDoc = reader.maxDoc();
        final BitSet bits = new BitSet(maxDoc);
        // Mark every document as visible first. The upper bound of
        // BitSet.set(from, to) is exclusive, and BitSet.size() is the allocated
        // capacity rather than the number of documents, so maxDoc is the bound.
        bits.set(0, maxDoc);
        // Term matching the documents with the highest security level.
        Term term = new Term("securitylevel", ADVANCED + "");
        TermDocs termDocs = reader.termDocs(term);
        try {
            // Clear the bit of every matching document so it is filtered out.
            while (termDocs.next()) {
                bits.clear(termDocs.doc());
            }
        } finally {
            termDocs.close();
        }
        return bits;
    }
}
过滤器使用实例:
/**
 * Demo that runs a range query over the {@code bookNumber} field and applies
 * {@code AdvancedSecurityFilter} to the search.
 */
public class FilterDemo {
    // Security levels stored in the "securitylevel" field of the sample index.
    public static final int ADVANCED = 0;
    public static final int MIDDLE = 1;
    public static final int NORMAL = 2;

    /**
     * Searches the index at {@code d://demo} for book numbers 0001..0004
     * (inclusive) with the security filter applied and prints each hit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // One-off index construction, kept for reference: it creates the three
            // sample documents (one per security level) that the query below expects.
            /*File file = new File("d://demo");
            Analyzer luceneAnalyzer = new StandardAnalyzer();
            IndexWriter writer = new IndexWriter(file, luceneAnalyzer, false);
            Document doc1 = new Document();
            Field f1 = new Field("bookNumber","0003",Field.Store.YES,Field.Index.UN_TOKENIZED);
            Field f2 = new Field("bookName","非对称模型",Field.Store.YES,Field.Index.UN_TOKENIZED);
            Field f3 = new Field("securitylevel",ADVANCED+"",Field.Store.YES,Field.Index.UN_TOKENIZED);
            doc1.add(f1);
            doc1.add(f2);
            doc1.add(f3);
            Document doc2 = new Document();
            Field f4 = new Field("bookNumber","0001",Field.Store.YES,Field.Index.UN_TOKENIZED);
            Field f5 = new Field("bookName","钢铁战士",Field.Store.YES,Field.Index.TOKENIZED);
            Field f6 = new Field("securitylevel",MIDDLE+"",Field.Store.YES,Field.Index.UN_TOKENIZED);
            doc2.add(f4);
            doc2.add(f5);
            doc2.add(f6);
            Document doc3 = new Document();
            Field f7 = new Field("bookNumber","0004",Field.Store.YES,Field.Index.UN_TOKENIZED);
            Field f8 = new Field("bookName","黑猫警长",Field.Store.YES,Field.Index.TOKENIZED);
            Field f9 = new Field("securitylevel",NORMAL+"",Field.Store.YES,Field.Index.UN_TOKENIZED);
            doc3.add(f7);
            doc3.add(f8);
            doc3.add(f9);
            writer.addDocument(doc1);
            writer.addDocument(doc2);
            writer.addDocument(doc3);
            writer.setUseCompoundFile(true);
            writer.optimize();
            writer.close();*/
            Term begin = new Term("bookNumber", "0001");
            Term end = new Term("bookNumber", "0004");
            RangeQuery q = new RangeQuery(begin, end, true);
            IndexSearcher searcher = new IndexSearcher("d://demo");
            try {
                System.out.println(q.toString());
                // Passing the custom filter to search() is what applies the filtering.
                Hits hits = searcher.search(q, new AdvancedSecurityFilter());
                for (int i = 0; i < hits.length(); i++) {
                    System.out.println(hits.doc(i));
                }
            } finally {
                // Release the index files held by the searcher even if the search fails.
                searcher.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
rank/ITEYEBlogSimilarChecker.java at master · ysc/rank · GitHub
我们如何应对这样的商业广告呢?基本思路如下:
1、当管理员发现一篇博文为黑博文时,人工确认。
2、将人工确认的黑博文保存到黑博文数据库。
3、当有新博文发表时,和黑博文数据库进行相似度计算,如果相似度超过预设的阈值,则拒绝发表博文。
下面是黑博文判断程序的详细判断过程,先上最终结果:
判定相似性的方式一:简单共有词
阈值=Math.min(339, 340)*0.8=271.2
待发表博文和黑博文共有的词数:339
因为待发表博文和黑博文共有的词数339 大于 阈值:271.2
所以判断为 相似 ,拒绝发表!
判定相似性的方式二:余弦相似度
待发表博文和黑博文的余弦夹角值:0.9977658868305056
因为待发表博文和黑博文的余弦夹角值0.9977658868305056大于或等于阈值:0.8
所以判断为 相似 ,拒绝发表!
/** | |
* | |
* APDPlat - Application Product Development Platform | |
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com | |
* | |
* This program is free software: you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation, either version 3 of the License, or | |
* (at your option) any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with this program. If not, see <http://www.gnu.org/licenses/>. | |
* | |
*/ | |
package org.seo.rank.impl; | |
import org.apdplat.word.WordSegmenter; | |
import org.apdplat.word.segmentation.Word; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import org.seo.rank.SimilarChecker; | |
import org.seo.rank.list.DynamicIp; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import java.math.BigDecimal; | |
import java.util.*; | |
import java.util.concurrent.atomic.AtomicInteger; | |
/** | |
* ITEYE博文相似性检测 | |
* @author 杨尚川 | |
*/ | |
public class ITEYEBlogSimilarChecker implements SimilarChecker{ | |
private static final Logger LOGGER = LoggerFactory.getLogger(ITEYEBlogSimilarChecker.class); | |
private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; | |
private static final String ENCODING = "gzip, deflate"; | |
private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; | |
private static final String CONNECTION = "keep-alive"; | |
private static final String REFERER = "http://yangshangchuan.iteye.com"; | |
private static final String HOST = "yangshangchuan.iteye.com"; | |
private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; | |
private static final String BLOG_CSS_PATH = "html body div#page div#content.clearfix div#main div.blog_main"; | |
private static final String BLOG_TITLE_CSS_PATH = "div.blog_title"; | |
private static final String BLOG_CONTENT_CSS_PATH = "div#blog_content.blog_content"; | |
private static final float THRESHOLD_RATE = 0.8F; | |
@Override | |
public boolean isSimilar(String url1, String url2) { | |
return similarScore(url1, url2)>=THRESHOLD_RATE; | |
} | |
@Override | |
public double similarScore(String url1, String url2) { | |
Blog blog1 = getBlog(url1); | |
if(blog1!=null) { | |
Blog blog2 = getBlog(url2); | |
if(blog2!=null) { | |
double score = score(blog1, blog2); | |
//取两位小数 | |
score = (int)(score*100)/(double)100; | |
return score; | |
} | |
} | |
return 0; | |
} | |
private double score(Blog blog1, Blog blog2){ | |
//分词 | |
List<Word> blog1Words = WordSegmenter.seg(blog1.getTitle()+"\n"+blog1.getContent()); | |
List<Word> blog2Words = WordSegmenter.seg(blog2.getTitle()+"\n"+blog2.getContent()); | |
//词频统计 | |
Map<Word, AtomicInteger> blog1WordsFre = frequence(blog1Words); | |
Map<Word, AtomicInteger> blog2WordsFre = frequence(blog2Words); | |
//输出详细信息 | |
if(LOGGER.isDebugEnabled()){ | |
showDetail(blog1, blog1Words, blog1WordsFre); | |
showDetail(blog2, blog2Words, blog2WordsFre); | |
} | |
//使用简单共有词判定 | |
return simpleScore(blog1WordsFre, blog2WordsFre); | |
//使用余弦相似度判定 | |
//return cosScore(blog1WordsFre, blog2WordsFre); | |
} | |
/** | |
* 判定相似性的方式一:简单共有词 | |
* @param blog1WordsFre | |
* @param blog2WordsFre | |
* @return | |
*/ | |
private double simpleScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){ | |
//判断有几个相同的词 | |
AtomicInteger intersectionLength = new AtomicInteger(); | |
blog1WordsFre.keySet().forEach(word -> { | |
if (blog2WordsFre.keySet().contains(word)) { | |
intersectionLength.incrementAndGet(); | |
} | |
}); | |
LOGGER.info("网页1有的词数:" + blog1WordsFre.size()); | |
LOGGER.info("网页2有的词数:" + blog2WordsFre.size()); | |
LOGGER.info("网页1和2共有的词数:" + intersectionLength.get()); | |
double score = intersectionLength.get()/(double)Math.min(blog1WordsFre.size(), blog2WordsFre.size()); | |
LOGGER.info("相似度分值="+intersectionLength.get()+"/(double)Math.min("+blog1WordsFre.size()+", "+blog2WordsFre.size()+")="+score); | |
return score; | |
} | |
/** | |
* | |
* 判定相似性的方式二:余弦相似度 | |
* 余弦夹角原理: | |
* 向量a=(x1,y1),向量b=(x2,y2) | |
* a.b=x1x2+y1y2 | |
* |a|=根号[(x1)^2+(y1)^2],|b|=根号[(x2)^2+(y2)^2] | |
* a,b的夹角的余弦cos=a.b/|a|*|b|=(x1x2+y1y2)/根号[(x1)^2+(y1)^2]*根号[(x2)^2+(y2)^2] | |
* @param blog1WordsFre | |
* @param blog2WordsFre | |
*/ | |
private double cosScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){ | |
Set<Word> words = new HashSet<>(); | |
words.addAll(blog1WordsFre.keySet()); | |
words.addAll(blog2WordsFre.keySet()); | |
//向量的维度为words的大小,每一个维度的权重是词频,注意的是,中文分词的时候已经去了停用词 | |
//a.b | |
AtomicInteger ab = new AtomicInteger(); | |
//|a| | |
AtomicInteger aa = new AtomicInteger(); | |
//|b| | |
AtomicInteger bb = new AtomicInteger(); | |
//计算 | |
words | |
.stream() | |
.forEach(word -> { | |
AtomicInteger x1 = blog1WordsFre.get(word); | |
AtomicInteger x2 = blog2WordsFre.get(word); | |
if(x1!=null && x2!=null) { | |
//x1x2 | |
int oneOfTheDimension = x1.get() * x2.get(); | |
//+ | |
ab.addAndGet(oneOfTheDimension); | |
} | |
if(x1!=null){ | |
//(x1)^2 | |
int oneOfTheDimension = x1.get() * x1.get(); | |
//+ | |
aa.addAndGet(oneOfTheDimension); | |
} | |
if(x2!=null){ | |
//(x2)^2 | |
int oneOfTheDimension = x2.get() * x2.get(); | |
//+ | |
bb.addAndGet(oneOfTheDimension); | |
} | |
}); | |
double aaa = Math.sqrt(aa.get()); | |
double bbb = Math.sqrt(bb.get()); | |
//使用BigDecimal保证精确计算浮点数 | |
BigDecimal aabb = BigDecimal.valueOf(aaa).multiply(BigDecimal.valueOf(bbb)); | |
double cos = ab.get()/aabb.doubleValue(); | |
return cos; | |
} | |
private void showDetail(Blog blog, List<Word> blogWords, Map<Word, AtomicInteger> blogWordsFre){ | |
LOGGER.debug("博文URL:"); | |
LOGGER.debug("\t"+blog.getUrl()); | |
LOGGER.debug("博文标题:"); | |
LOGGER.debug("\t"+blog.getTitle()); | |
LOGGER.debug("博文内容:"); | |
LOGGER.debug("\t"+blog.getContent()); | |
LOGGER.debug("博文长度:"+blog.getContent().length()); | |
LOGGER.debug("博文分词结果:"); | |
LOGGER.debug("\t" + blogWords); | |
LOGGER.debug("博文词频统计:"); | |
AtomicInteger c = new AtomicInteger(); | |
blogWordsFre | |
.entrySet() | |
.stream() | |
.sorted((a,b)->b.getValue().get()-a.getValue().get()) | |
.forEach(e->LOGGER.debug("\t"+c.incrementAndGet()+"、"+e.getKey()+"="+e.getValue())); | |
} | |
private Map<Word, AtomicInteger> frequence(List<Word> words){ | |
Map<Word, AtomicInteger> fre =new HashMap<>(); | |
words.forEach(word->{ | |
fre.putIfAbsent(word, new AtomicInteger()); | |
fre.get(word).incrementAndGet(); | |
}); | |
return fre; | |
} | |
private Blog getBlog(String url) { | |
try { | |
String html = getHtml(url); | |
Document doc = Jsoup.parse(html); | |
Elements elements = doc.select(BLOG_CSS_PATH); | |
String title = null; | |
String content = null; | |
for(Element element : elements){ | |
Elements ts = element.select(BLOG_TITLE_CSS_PATH); | |
if(ts.size()==1){ | |
title = ts.get(0).text(); | |
} | |
ts = element.select(BLOG_CONTENT_CSS_PATH); | |
if(ts.size()==1){ | |
content = ts.get(0).text(); | |
} | |
} | |
if(title!=null && content!=null){ | |
Blog blog = new Blog(); | |
blog.setUrl(url); | |
blog.setTitle(title); | |
blog.setContent(content); | |
return blog; | |
} | |
} catch (Exception e) { | |
LOGGER.error("获取博文失败", e); | |
} | |
return null; | |
} | |
private String getHtml(String url){ | |
String html = getHtmlInternal(url); | |
int times = 1; | |
while (html==null && times<4){ | |
times++; | |
//使用新的IP地址 | |
DynamicIp.toNewIp(); | |
html = getHtmlInternal(url); | |
} | |
times = 1; | |
//LOGGER.debug("获取到的HTML:" +html); | |
while((html.contains("非常抱歉,来自您ip的请求异常频繁") | |
|| html.contains("请您点击按钮解除封锁") | |
|| html.contains("请输入以下验证码")) | |
&& times<4){ | |
times++; | |
//使用新的IP地址 | |
DynamicIp.toNewIp(); | |
html = getHtmlInternal(url); | |
} | |
return html; | |
} | |
private String getHtmlInternal(String url) { | |
try { | |
return Jsoup.connect(url) | |
.header("Accept", ACCEPT) | |
.header("Accept-Encoding", ENCODING) | |
.header("Accept-Language", LANGUAGE) | |
.header("Connection", CONNECTION) | |
.header("Referer", REFERER) | |
.header("Host", HOST) | |
.header("User-Agent", USER_AGENT) | |
.header("X-Forwarded-For", getRandomIp()) | |
.header("Proxy-Client-IP", getRandomIp()) | |
.header("WL-Proxy-Client-IP", getRandomIp()) | |
.ignoreContentType(true) | |
.timeout(30000) | |
.get().html(); | |
} catch (Exception e) { | |
LOGGER.error("获取博文失败", e); | |
} | |
return null; | |
} | |
private String getRandomIp(){ | |
int first = new Random().nextInt(254)+1; | |
//排除A类私有地址0.0.0.0--10.255.255.255 | |
while(first==10){ | |
first = new Random().nextInt(254)+1; | |
} | |
int second = new Random().nextInt(254)+1; | |
//排除B类私有地址172.16.0.0--172.31.255.255 | |
while(first==172 && (second>=16 && second<=31)){ | |
first = new Random().nextInt(254)+1; | |
second = new Random().nextInt(254)+1; | |
} | |
//排除C类私有地址192.168.0.0--192.168.255.255 | |
while(first==192 && second==168){ | |
first = new Random().nextInt(254)+1; | |
second = new Random().nextInt(254)+1; | |
} | |
int third = new Random().nextInt(254)+1; | |
int forth = new Random().nextInt(254)+1; | |
return first+"."+second+"."+second+"."+forth; | |
} | |
private static class Blog{ | |
private String url; | |
private String title; | |
private String content; | |
public String getUrl() { | |
return url; | |
} | |
public void setUrl(String url) { | |
this.url = url; | |
} | |
public String getTitle() { | |
return title; | |
} | |
public void setTitle(String title) { | |
this.title = title; | |
} | |
public String getContent() { | |
return content; | |
} | |
public void setContent(String content) { | |
this.content = content; | |
} | |
} | |
public static void main(String[] args) { | |
SimilarChecker similarChecker = new ITEYEBlogSimilarChecker(); | |
double score = similarChecker.similarScore("http://baidu-27233181.iteye.com/blog/2200707", | |
"http://baidu-27233181.iteye.com/blog/2200706"); | |
LOGGER.info("相似度分值:"+score); | |
} | |
} |
java多线程实现任务超时监听 - huangying2124的专栏 - 博客频道 - CSDN.NET
使用Future的特性(推荐)
利用Future.get(long timeout, TimeUnit unit)方法。
1、新建TaskThread类,实现Callable接口,实现call()方法。
2、线程池调用submit()方法,得到Future对象。
3、调用Future对象的get(long timeout, TimeUnit unit)方法。该方法的特点:阻塞调用线程,同时指定超时时间timeout;若在超时时间内任务没有完成,get方法会抛出TimeoutException,该异常需要捕获。
示例代码:
/**
 * Sample task for the timeout demo: sleeps for one second, then returns a
 * fixed string.
 */
public class TimeTask implements Callable<String> {
    /** How long the simulated work takes, in milliseconds. */
    private static final long WORK_MILLIS = 1000;
    /** Value handed back once the simulated work is done. */
    private static final String RESULT = "hehe";

    /**
     * Runs the task body (a simple placeholder for real work).
     *
     * @return the fixed result string
     * @throws Exception if the sleeping thread is interrupted
     */
    @Override
    public String call() throws Exception {
        Thread.sleep(WORK_MILLIS);
        return RESULT;
    }
}
- ExecutorService exec = Executors.newCachedThreadPool();
- Future<String> f = exec.submit(new TimeTask());
- try {
- f.get(200, TimeUnit.MILLISECONDS);
- } catch (InterruptedException e) {
- e.printStackTrace();
- } catch (ExecutionException e) {
- e.printStackTrace();
- } catch (TimeoutException e) {
- //定义超时后的状态修改
- System.out.println("thread time out");
- e.printStackTrace();
- }
Google Guava已经提供了TimeLimiter的功能,实现更精巧,功能更强大,可参考:
- import com.google.common.util.concurrent.SimpleTimeLimiter;
- public class TimeLimiterGoogle {
- /**
- * @param args
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- // TODO Auto-generated method stub
- SimpleTimeLimiter st = new SimpleTimeLimiter();
- String r1 = st.callWithTimeout(new Callable<String>(){
- @Override
- public String call() throws Exception {
- return "Hello";
- }}, 10, TimeUnit.MILLISECONDS, true);
- System.out.println(r1);
- String r2 = st.callWithTimeout(new Callable<String>(){
- @Override
- public String call() throws Exception {
- Thread.sleep(1000);
- return "Hello";
- }}, 10, TimeUnit.MILLISECONDS, true);
- System.out.println(r2);
- }
- }