Implementing Related-Content Recommendation with Lucene's MoreLikeThisQuery
Before analyzing MoreLikeThisQuery, let's first look at MoreLikeThis.
When building a search application, you often run into requirements like "more articles like this" or "more related questions": given the text of the current document, find similar documents in the index.
MoreLikeThis implements exactly this:
IndexReader reader = IndexReader.open(……);
IndexSearcher searcher = new IndexSearcher(reader);
MoreLikeThis mlt = new MoreLikeThis(reader);
Reader target = ... // a java.io.Reader over the text of the current document
Query query = mlt.like(target); // build a query from the current text
Hits hits = searcher.search(query); // search for similar documents
MoreLikeThis's Query like(Reader r) method looks like this:
public Query like(Reader r) throws IOException {
    // Extract terms from the current document's text, then build a query out of them.
    return createQuery(retrieveTerms(r));
}
public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
    Map<String, Int> words = new HashMap<String, Int>();
    // Extract terms per field; which fields are used is configured via
    // void setFieldNames(String[] fieldNames).
    for (int i = 0; i < fieldNames.length; i++) {
        String fieldName = fieldNames[i];
        addTermFrequencies(r, words, fieldName);
    }
    // Put the extracted terms into a priority queue.
    return createQueue(words);
}
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    // Tokenize the current text; the analyzer is configured via void setAnalyzer(Analyzer analyzer).
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    // Iterate over the tokens.
    while (ts.incrementToken()) {
        String word = termAtt.term();
        tokenCount++;
        // Stop once the token count exceeds the limit set via void setMaxNumTokensParsed(int i).
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        // A token is treated as noise if it is shorter than the minimum length,
        // longer than the maximum length, or a stop word.
        // Minimum length: void setMinWordLen(int minWordLen)
        // Maximum length: void setMaxWordLen(int maxWordLen)
        // Stop words:     void setStopWords(Set<?> stopWords)
        if (isNoiseWord(word)) {
            continue;
        }
        // Count the term frequency (tf).
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
}
private PriorityQueue createQueue(Map<String, Int> words) throws IOException {
    // Build a priority queue from the collected terms and their frequencies.
    int numDocs = ir.numDocs();
    FreqQ res = new FreqQ(words.size()); // priority queue ordered by tf*idf
    Iterator<String> it = words.keySet().iterator();
    // Iterate over every term.
    while (it.hasNext()) {
        String word = it.next();
        int tf = words.get(word).x;
        // Skip terms below the minimum term frequency, set via void setMinTermFreq(int minTermFreq).
        if (minTermFreq > 0 && tf < minTermFreq) {
            continue;
        }
        // Across all fields, find the field that contains this term with the largest document frequency.
        String topField = fieldNames[0];
        int docFreq = 0;
        for (int i = 0; i < fieldNames.length; i++) {
            int freq = ir.docFreq(new Term(fieldNames[i], word));
            topField = (freq > docFreq) ? fieldNames[i] : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }
        // Skip terms whose document frequency is below the minimum, set via void setMinDocFreq(int minDocFreq).
        if (minDocFreq > 0 && docFreq < minDocFreq) {
            continue;
        }
        // Skip terms whose document frequency is above the maximum, set via void setMaxDocFreq(int maxFreq).
        if (docFreq > maxDocFreq) {
            continue;
        }
        if (docFreq == 0) {
            continue;
        }
        // Score the term by tf*idf.
        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;
        // Push an Object[] into the priority queue; only the first three entries are used later,
        // and the queue orders by the third entry, the score.
        res.insertWithOverflow(new Object[]{
                word,                     // the term
                topField,                 // the field
                Float.valueOf(score),     // the score
                Float.valueOf(idf),       // idf
                Integer.valueOf(docFreq), // document frequency
                Integer.valueOf(tf)       // term frequency
        });
    }
    return res;
}
private Query createQuery(PriorityQueue q) {
    // The final result is a boolean query.
    BooleanQuery query = new BooleanQuery();
    Object cur;
    int qterms = 0;
    float bestScore = 0;
    // Keep popping the highest-scoring term off the queue.
    while (((cur = q.pop()) != null)) {
        Object[] ar = (Object[]) cur;
        TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
        if (boost) {
            if (qterms == 0) {
                // The first term popped has the highest score; use it as bestScore.
                bestScore = ((Float) ar[2]).floatValue();
            }
            float myScore = ((Float) ar[2]).floatValue();
            // Each term's score is divided by bestScore and multiplied by boostFactor to get the
            // boost of its term query, so terms that score higher in the source text also carry
            // a higher boost, and thus more weight, in the generated query.
            tq.setBoost(boostFactor * myScore / bestScore);
        }
        try {
            query.add(tq, BooleanClause.Occur.SHOULD);
        } catch (BooleanQuery.TooManyClauses ignore) {
            break;
        }
        qterms++;
        // Stop once the number of query terms reaches the limit set via void setMaxQueryTerms(int maxQueryTerms).
        if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
            break;
        }
    }
    return query;
}
MoreLikeThisQuery is just a wrapper around MoreLikeThis: it carries the parameters MoreLikeThis needs, and at rewrite time it calls MoreLikeThis.like to generate the actual query object.
- String likeText; the text of the current document
- String[] moreLikeFields; the fields from which query terms are extracted
- Analyzer analyzer; the analyzer
- float percentTermsToMatch=0.3f; the clauses of the generated BooleanQuery are all SHOULD, and at least this fraction of them must match
- int minTermFrequency=1; the minimum term frequency
- int maxQueryTerms=5; the maximum number of query terms
- Set<?> stopWords=null; the stop-word set
- int minDocFreq=-1; the minimum document frequency
public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setFieldNames(moreLikeFields);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(minTermFrequency);
    if (minDocFreq >= 0) {
        mlt.setMinDocFreq(minDocFreq);
    }
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setStopWords(stopWords);
    // Note: likeText.getBytes() uses the platform default charset.
    BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
    BooleanClause[] clauses = bq.getClauses();
    // At least percentTermsToMatch of the SHOULD clauses must match.
    bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
    return bq;
}
As an example, take the post at http://topic.csdn.net/u/20100501/09/64e41f24-e69a-40e3-9058-17487e4f311b.html?1469.

We index the posts from its "related questions" section together with some others, 20 documents in all.
File indexDir = new File("TestMoreLikeThisQuery/index");
IndexReader reader = IndexReader.open(indexDir);
IndexSearcher searcher = new IndexSearcher(reader);
// Read the article《IT外企那点儿事》 from a file to use as the likeText.
StringBuffer contentBuffer = new StringBuffer();
BufferedReader input = new BufferedReader(new InputStreamReader(
        new FileInputStream("TestMoreLikeThisQuery/IT外企那点儿事.txt"), "utf-8"));
String line = null;
while ((line = input.readLine()) != null) {
    contentBuffer.append(line);
}
String content = contentBuffer.toString();
// Tokenize with the ICTCLAS Chinese analyzer.
MoreLikeThisQuery query = new MoreLikeThisQuery(content, new String[]{"contents"},
        new MyAnalyzer(new ChineseAnalyzer()));
// Treat words that appear in most documents as stop words (the helper below uses a 70% threshold);
// real applications may use other stop-word strategies.
query.setStopWords(getStopWords(reader));
// Only words occurring at least 5 times are considered important.
query.setMinTermFrequency(5);
// Use just one query term.
query.setMaxQueryTerms(1);
TopDocs docs = searcher.search(query, 50);
for (ScoreDoc doc : docs.scoreDocs) {
    Document ldoc = reader.document(doc.doc);
    String title = ldoc.get("title");
    System.out.println(title);
}
static Set<String> getStopWords(IndexReader reader) throws IOException {
    HashSet<String> stop = new HashSet<String>();
    int numOfDocs = reader.numDocs();
    // A term is a stop word if it appears in at least 70% of the documents.
    int stopThreshhold = (int) (numOfDocs * 0.7f);
    TermEnum te = reader.terms();
    while (te.next()) {
        String text = te.term().text();
        if (te.docFreq() >= stopThreshhold) {
            stop.add(text);
        }
    }
    return stop;
}
The results are:
揭开外企的底儿(连载六)——外企招聘也有潜规则.txt
去央企还是外企,帮忙分析下.txt
哪种英语教材比较适合英语基础差的人.txt
有在达内外企软件工程师就业班培训过的吗.txt
两个月的“骑驴找马”,面试无数家公司的深圳体验.txt
一个看了可能改变你一生的小说《做单》,外企销售经理做单技巧大揭密.txt
HR的至高机密:20个公司绝对不会告诉你的潜规则.txt
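The MyAnalyzer wrapper used above is not shown in the original post; a minimal sketch of what such a pass-through wrapper could look like, assuming the pre-4.0 Analyzer API (the class and its delegation strategy are assumptions, not the author's actual code):
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical wrapper that simply delegates to the wrapped analyzer (e.g. ChineseAnalyzer).
public class MyAnalyzer extends Analyzer {
    private final Analyzer delegate;
    public MyAnalyzer(Analyzer delegate) {
        this.delegate = delegate;
    }
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return delegate.tokenStream(fieldName, reader);
    }
}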
Lucene Filters (baobeituping, ITeye)
Some applications require that certain kinds of content never be returned, even when they match the query. Lucene provides a filtering mechanism for this: subclass Filter to implement the specific filtering behavior you need.
Filter defines the filtering behavior; a BitSet is a bit vector whose entries take only two values, TRUE or FALSE, which Lucene uses to mark whether a document is filtered out. That is, when returning results, Lucene first walks the BitSet and returns only the documents whose bit is TRUE.
The filter:
package com.filter;

import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.Filter;

public class AdvancedSecurityFilter extends Filter {
    // Constant for the highest security level.
    public static final int ADVANCED = 0;
    @Override
    public BitSet bits(IndexReader reader) throws IOException {
        // First create a BitSet with one bit per document.
        final BitSet bits = new BitSet(reader.maxDoc());
        // Set all bits to TRUE: initially every document in the set is retrievable.
        // BitSet.set(from, to) covers [from, to), so maxDoc() is the correct upper bound.
        bits.set(0, reader.maxDoc());
        // Build a Term representing the highest security level.
        Term term = new Term("securitylevel", ADVANCED + "");
        // Find every document with the highest security level.
        TermDocs termDocs = reader.termDocs(term);
        // Walk those documents and clear their bits so they are filtered out.
        while (termDocs.next()) {
            bits.set(termDocs.doc(), false);
        }
        return bits;
    }
}
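For reference, Filter.bits(IndexReader) was deprecated in Lucene 2.4 in favor of getDocIdSet(IndexReader). A sketch of the same filter in that style, assuming the Lucene 2.4–3.x API:
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.util.OpenBitSet;

// Sketch only: the getDocIdSet-based equivalent of AdvancedSecurityFilter.
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    bits.set(0, reader.maxDoc()); // every document retrievable by default
    TermDocs termDocs = reader.termDocs(new Term("securitylevel", ADVANCED + ""));
    while (termDocs.next()) {
        bits.clear(termDocs.doc()); // hide top-security documents
    }
    return bits; // OpenBitSet implements DocIdSet
}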
Using the filter:
public class FilterDemo {
    public static final int ADVANCED = 0;
    public static final int MIDDLE = 1;
    public static final int NORMAL = 2;
    public static void main(String[] args) {
        try {
            /* Index-building code, commented out after the first run:
            File file = new File("d://demo");
            Analyzer luceneAnalyzer = new StandardAnalyzer();
            IndexWriter writer = new IndexWriter(file, luceneAnalyzer, false);
            Document doc1 = new Document();
            Field f1 = new Field("bookNumber", "0003", Field.Store.YES, Field.Index.UN_TOKENIZED);
            Field f2 = new Field("bookName", "非对称模型", Field.Store.YES, Field.Index.UN_TOKENIZED);
            Field f3 = new Field("securitylevel", ADVANCED + "", Field.Store.YES, Field.Index.UN_TOKENIZED);
            doc1.add(f1);
            doc1.add(f2);
            doc1.add(f3);
            Document doc2 = new Document();
            Field f4 = new Field("bookNumber", "0001", Field.Store.YES, Field.Index.UN_TOKENIZED);
            Field f5 = new Field("bookName", "钢铁战士", Field.Store.YES, Field.Index.TOKENIZED);
            Field f6 = new Field("securitylevel", MIDDLE + "", Field.Store.YES, Field.Index.UN_TOKENIZED);
            doc2.add(f4);
            doc2.add(f5);
            doc2.add(f6);
            Document doc3 = new Document();
            Field f7 = new Field("bookNumber", "0004", Field.Store.YES, Field.Index.UN_TOKENIZED);
            Field f8 = new Field("bookName", "黑猫警长", Field.Store.YES, Field.Index.TOKENIZED);
            Field f9 = new Field("securitylevel", NORMAL + "", Field.Store.YES, Field.Index.UN_TOKENIZED);
            doc3.add(f7);
            doc3.add(f8);
            doc3.add(f9);
            writer.addDocument(doc1);
            writer.addDocument(doc2);
            writer.addDocument(doc3);
            writer.setUseCompoundFile(true);
            writer.optimize();
            writer.close();
            */
            Term begin = new Term("bookNumber", "0001");
            Term end = new Term("bookNumber", "0004");
            RangeQuery q = new RangeQuery(begin, end, true);
            IndexSearcher searcher = new IndexSearcher("d://demo");
            System.out.println(q.toString());
            // Passing the custom filter to the search method applies the filtering.
            Hits hits = searcher.search(q, new AdvancedSecurityFilter());
            for (int i = 0; i < hits.length(); i++) {
                System.out.println(hits.doc(i));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
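Hits was deprecated in Lucene 2.3 and removed in 3.0; on those versions the filtered search would go through TopDocs instead, roughly as follows (a sketch assuming the Lucene 3.x search(Query, Filter, int) overload):
// Sketch only: TopDocs-based equivalent of the filtered search above.
TopDocs topDocs = searcher.search(q, new AdvancedSecurityFilter(), 10);
for (ScoreDoc sd : topDocs.scoreDocs) {
    System.out.println(searcher.doc(sd.doc));
}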
rank/ITEYEBlogSimilarChecker.java at master · ysc/rank · GitHub
How do we deal with this kind of commercial spam? The basic idea:
1. When an administrator spots a spam ("black") post, they confirm it manually.
2. Manually confirmed black posts are saved into a black-post database.
3. When a new post is submitted, compute its similarity against the black-post database; if the similarity exceeds a preset threshold, refuse to publish the post (a sketch of this gate follows the list).
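A minimal sketch of that gate, reusing the SimilarChecker interface from the code below (blackBlogUrls, newPostUrl, and the 0.8 threshold are illustrative assumptions):
// Hypothetical gate: refuse publication if the new post resembles any known black post.
boolean reject = blackBlogUrls.stream()
        .anyMatch(blackUrl -> similarChecker.similarScore(newPostUrl, blackUrl) >= 0.8);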
Below is the detailed decision process of the black-post checker, final results first:
Similarity test one: simple shared words
threshold = Math.min(339, 340)*0.8 = 271.2
words shared by the pending post and the black post: 339
because the shared word count 339 is greater than the threshold 271.2,
the posts are judged similar and publication is refused!
Similarity test two: cosine similarity
cosine between the pending post and the black post: 0.9977658868305056
because the cosine 0.9977658868305056 is greater than or equal to the threshold 0.8,
the posts are judged similar and publication is refused!
/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.seo.rank.impl;

import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.SimilarChecker;
import org.seo.rank.list.DynamicIp;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.math.BigDecimal;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * ITEYE blog similarity checker
 * @author 杨尚川
 */
public class ITEYEBlogSimilarChecker implements SimilarChecker {
    private static final Logger LOGGER = LoggerFactory.getLogger(ITEYEBlogSimilarChecker.class);
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String REFERER = "http://yangshangchuan.iteye.com";
    private static final String HOST = "yangshangchuan.iteye.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
    private static final String BLOG_CSS_PATH = "html body div#page div#content.clearfix div#main div.blog_main";
    private static final String BLOG_TITLE_CSS_PATH = "div.blog_title";
    private static final String BLOG_CONTENT_CSS_PATH = "div#blog_content.blog_content";
    private static final float THRESHOLD_RATE = 0.8F;

    @Override
    public boolean isSimilar(String url1, String url2) {
        return similarScore(url1, url2) >= THRESHOLD_RATE;
    }

    @Override
    public double similarScore(String url1, String url2) {
        Blog blog1 = getBlog(url1);
        if (blog1 != null) {
            Blog blog2 = getBlog(url2);
            if (blog2 != null) {
                double score = score(blog1, blog2);
                // Keep two decimal places.
                score = (int) (score * 100) / (double) 100;
                return score;
            }
        }
        return 0;
    }

    private double score(Blog blog1, Blog blog2) {
        // Segment the text into words.
        List<Word> blog1Words = WordSegmenter.seg(blog1.getTitle() + "\n" + blog1.getContent());
        List<Word> blog2Words = WordSegmenter.seg(blog2.getTitle() + "\n" + blog2.getContent());
        // Count word frequencies.
        Map<Word, AtomicInteger> blog1WordsFre = frequence(blog1Words);
        Map<Word, AtomicInteger> blog2WordsFre = frequence(blog2Words);
        // Log the details.
        if (LOGGER.isDebugEnabled()) {
            showDetail(blog1, blog1Words, blog1WordsFre);
            showDetail(blog2, blog2Words, blog2WordsFre);
        }
        // Judge similarity by simple shared words.
        return simpleScore(blog1WordsFre, blog2WordsFre);
        // Or judge by cosine similarity:
        //return cosScore(blog1WordsFre, blog2WordsFre);
    }

    /**
     * Similarity test one: simple shared words
     * @param blog1WordsFre
     * @param blog2WordsFre
     * @return
     */
    private double simpleScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre) {
        // Count how many words the two pages share.
        AtomicInteger intersectionLength = new AtomicInteger();
        blog1WordsFre.keySet().forEach(word -> {
            if (blog2WordsFre.keySet().contains(word)) {
                intersectionLength.incrementAndGet();
            }
        });
        LOGGER.info("Word count of page 1: " + blog1WordsFre.size());
        LOGGER.info("Word count of page 2: " + blog2WordsFre.size());
        LOGGER.info("Words shared by pages 1 and 2: " + intersectionLength.get());
        double score = intersectionLength.get() / (double) Math.min(blog1WordsFre.size(), blog2WordsFre.size());
        LOGGER.info("Similarity score=" + intersectionLength.get() + "/(double)Math.min(" + blog1WordsFre.size() + ", " + blog2WordsFre.size() + ")=" + score);
        return score;
    }

    /**
     * Similarity test two: cosine similarity
     * Cosine principle:
     * vector a=(x1,y1), vector b=(x2,y2)
     * a.b = x1x2 + y1y2
     * |a| = sqrt[(x1)^2 + (y1)^2], |b| = sqrt[(x2)^2 + (y2)^2]
     * cosine of the angle between a and b:
     *   cos = a.b / (|a|*|b|) = (x1x2 + y1y2) / (sqrt[(x1)^2 + (y1)^2] * sqrt[(x2)^2 + (y2)^2])
     * @param blog1WordsFre
     * @param blog2WordsFre
     */
    private double cosScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre) {
        Set<Word> words = new HashSet<>();
        words.addAll(blog1WordsFre.keySet());
        words.addAll(blog2WordsFre.keySet());
        // The vectors have one dimension per word in `words`, weighted by term frequency;
        // note that stop words were already removed during word segmentation.
        // a.b
        AtomicInteger ab = new AtomicInteger();
        // sum of squares for |a|
        AtomicInteger aa = new AtomicInteger();
        // sum of squares for |b|
        AtomicInteger bb = new AtomicInteger();
        // Accumulate.
        words
            .stream()
            .forEach(word -> {
                AtomicInteger x1 = blog1WordsFre.get(word);
                AtomicInteger x2 = blog2WordsFre.get(word);
                if (x1 != null && x2 != null) {
                    // x1x2
                    int oneOfTheDimension = x1.get() * x2.get();
                    ab.addAndGet(oneOfTheDimension);
                }
                if (x1 != null) {
                    // (x1)^2
                    int oneOfTheDimension = x1.get() * x1.get();
                    aa.addAndGet(oneOfTheDimension);
                }
                if (x2 != null) {
                    // (x2)^2
                    int oneOfTheDimension = x2.get() * x2.get();
                    bb.addAndGet(oneOfTheDimension);
                }
            });
        double aaa = Math.sqrt(aa.get());
        double bbb = Math.sqrt(bb.get());
        // Use BigDecimal for exact floating-point multiplication.
        BigDecimal aabb = BigDecimal.valueOf(aaa).multiply(BigDecimal.valueOf(bbb));
        double cos = ab.get() / aabb.doubleValue();
        return cos;
    }

    private void showDetail(Blog blog, List<Word> blogWords, Map<Word, AtomicInteger> blogWordsFre) {
        LOGGER.debug("Blog URL:");
        LOGGER.debug("\t" + blog.getUrl());
        LOGGER.debug("Blog title:");
        LOGGER.debug("\t" + blog.getTitle());
        LOGGER.debug("Blog content:");
        LOGGER.debug("\t" + blog.getContent());
        LOGGER.debug("Blog length: " + blog.getContent().length());
        LOGGER.debug("Blog segmentation result:");
        LOGGER.debug("\t" + blogWords);
        LOGGER.debug("Blog word frequencies:");
        AtomicInteger c = new AtomicInteger();
        blogWordsFre
            .entrySet()
            .stream()
            .sorted((a, b) -> b.getValue().get() - a.getValue().get())
            .forEach(e -> LOGGER.debug("\t" + c.incrementAndGet() + ". " + e.getKey() + "=" + e.getValue()));
    }

    private Map<Word, AtomicInteger> frequence(List<Word> words) {
        // Count occurrences of each word.
        Map<Word, AtomicInteger> fre = new HashMap<>();
        words.forEach(word -> {
            fre.putIfAbsent(word, new AtomicInteger());
            fre.get(word).incrementAndGet();
        });
        return fre;
    }

    private Blog getBlog(String url) {
        try {
            String html = getHtml(url);
            Document doc = Jsoup.parse(html);
            Elements elements = doc.select(BLOG_CSS_PATH);
            String title = null;
            String content = null;
            for (Element element : elements) {
                Elements ts = element.select(BLOG_TITLE_CSS_PATH);
                if (ts.size() == 1) {
                    title = ts.get(0).text();
                }
                ts = element.select(BLOG_CONTENT_CSS_PATH);
                if (ts.size() == 1) {
                    content = ts.get(0).text();
                }
            }
            if (title != null && content != null) {
                Blog blog = new Blog();
                blog.setUrl(url);
                blog.setTitle(title);
                blog.setContent(content);
                return blog;
            }
        } catch (Exception e) {
            LOGGER.error("Failed to fetch blog post", e);
        }
        return null;
    }

    private String getHtml(String url) {
        String html = getHtmlInternal(url);
        int times = 1;
        while (html == null && times < 4) {
            times++;
            // Switch to a new IP address.
            DynamicIp.toNewIp();
            html = getHtmlInternal(url);
        }
        times = 1;
        //LOGGER.debug("Fetched HTML: " + html);
        // Retry when the response is a block page; html may still be null if every fetch failed.
        while (html != null
                && (html.contains("非常抱歉,来自您ip的请求异常频繁")
                || html.contains("请您点击按钮解除封锁")
                || html.contains("请输入以下验证码"))
                && times < 4) {
            times++;
            // Switch to a new IP address.
            DynamicIp.toNewIp();
            html = getHtmlInternal(url);
        }
        return html;
    }

    private String getHtmlInternal(String url) {
        try {
            return Jsoup.connect(url)
                    .header("Accept", ACCEPT)
                    .header("Accept-Encoding", ENCODING)
                    .header("Accept-Language", LANGUAGE)
                    .header("Connection", CONNECTION)
                    .header("Referer", REFERER)
                    .header("Host", HOST)
                    .header("User-Agent", USER_AGENT)
                    .header("X-Forwarded-For", getRandomIp())
                    .header("Proxy-Client-IP", getRandomIp())
                    .header("WL-Proxy-Client-IP", getRandomIp())
                    .ignoreContentType(true)
                    .timeout(30000)
                    .get().html();
        } catch (Exception e) {
            LOGGER.error("Failed to fetch blog post", e);
        }
        return null;
    }

    private String getRandomIp() {
        int first = new Random().nextInt(254) + 1;
        // Skip Class A private addresses 10.0.0.0--10.255.255.255.
        while (first == 10) {
            first = new Random().nextInt(254) + 1;
        }
        int second = new Random().nextInt(254) + 1;
        // Skip Class B private addresses 172.16.0.0--172.31.255.255.
        while (first == 172 && (second >= 16 && second <= 31)) {
            first = new Random().nextInt(254) + 1;
            second = new Random().nextInt(254) + 1;
        }
        // Skip Class C private addresses 192.168.0.0--192.168.255.255.
        while (first == 192 && second == 168) {
            first = new Random().nextInt(254) + 1;
            second = new Random().nextInt(254) + 1;
        }
        int third = new Random().nextInt(254) + 1;
        int forth = new Random().nextInt(254) + 1;
        return first + "." + second + "." + third + "." + forth;
    }

    private static class Blog {
        private String url;
        private String title;
        private String content;
        public String getUrl() {
            return url;
        }
        public void setUrl(String url) {
            this.url = url;
        }
        public String getTitle() {
            return title;
        }
        public void setTitle(String title) {
            this.title = title;
        }
        public String getContent() {
            return content;
        }
        public void setContent(String content) {
            this.content = content;
        }
    }

    public static void main(String[] args) {
        SimilarChecker similarChecker = new ITEYEBlogSimilarChecker();
        double score = similarChecker.similarScore("http://baidu-27233181.iteye.com/blog/2200707",
                "http://baidu-27233181.iteye.com/blog/2200706");
        LOGGER.info("Similarity score: " + score);
    }
}
Implementing Task Timeout Monitoring with Java Multithreading (huangying2124, CSDN)
Using Future (recommended)
Use the Future.get(long timeout, TimeUnit unit) method:
1. Create a task class (TimeTask below) that implements the Callable interface and its call() method.
2. Submit it to a thread pool via submit(), which returns a Future object.
3. Call get(long timeout, TimeUnit unit) on the Future. The call blocks, but only up to the given timeout: if the task has not finished by then, get throws a TimeoutException, which must be caught.
Example code:
public class TimeTask implements Callable<String> {
    @Override
    public String call() throws Exception {
        // The task body; a trivial example.
        Thread.sleep(1000);
        return "hehe";
    }
}

ExecutorService exec = Executors.newCachedThreadPool();
Future<String> f = exec.submit(new TimeTask());
try {
    f.get(200, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
    e.printStackTrace();
} catch (ExecutionException e) {
    e.printStackTrace();
} catch (TimeoutException e) {
    // Handle the timeout: update state as needed.
    System.out.println("thread time out");
    e.printStackTrace();
}
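One caveat: a TimeoutException from get only abandons the wait; the task itself keeps running in the pool. To actually stop it, the timeout branch would typically also cancel the Future, along these lines:
} catch (TimeoutException e) {
    // Interrupt the still-running task; the task must be responsive to interruption
    // (e.g. Thread.sleep throws InterruptedException) for this to take effect.
    f.cancel(true);
}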
Google Guava already offers this as a TimeLimiter, with a more polished implementation and more features. For reference:
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import com.google.common.util.concurrent.SimpleTimeLimiter;

public class TimeLimiterGoogle {
    public static void main(String[] args) throws Exception {
        SimpleTimeLimiter st = new SimpleTimeLimiter();
        String r1 = st.callWithTimeout(new Callable<String>() {
            @Override
            public String call() throws Exception {
                return "Hello";
            }
        }, 10, TimeUnit.MILLISECONDS, true);
        System.out.println(r1);
        String r2 = st.callWithTimeout(new Callable<String>() {
            @Override
            public String call() throws Exception {
                // Sleeps past the 10 ms limit, so this call throws Guava's
                // UncheckedTimeoutException and the println below is never reached.
                Thread.sleep(1000);
                return "Hello";
            }
        }, 10, TimeUnit.MILLISECONDS, true);
        System.out.println(r2);
    }
}
Word Break II -- LeetCode (Code Ganker, CSDN)
Original problem link: http://oj.leetcode.com/problems/word-break-ii/
This problem is similar to Word Break, except that rather than just deciding whether the string can be broken, we must return every valid segmentation. Requirements like this usually blunt the benefit of dynamic programming considerably: all intermediate valid segmentations have to be recorded along the way, so each iteration does non-constant work, the complexity is no longer just the DP's two nested loops but is dominated by the number of results, and memory usage grows sharply, because the intermediate segmentations must be kept as well as the final ones (otherwise the history needed later would be unavailable). So we present two approaches: a direct brute-force recursion, and a dynamic program in the spirit of Word Break.
The brute-force solution is straightforward: maintain the current partial segmentation, scan every substring starting at the current position, and whenever a substring is in the dictionary, record it and recurse on the remaining characters. The pattern is close to the backtracking we routinely use on NP-style problems such as N-Queens. The code:
public ArrayList<String> wordBreak(String s, Set<String> dict) {
    ArrayList<String> res = new ArrayList<String>();
    if (s == null || s.length() == 0)
        return res;
    helper(s, dict, 0, "", res);
    return res;
}
private void helper(String s, Set<String> dict, int start, String item, ArrayList<String> res) {
    if (start >= s.length()) {
        res.add(item);
        return;
    }
    StringBuilder str = new StringBuilder();
    for (int i = start; i < s.length(); i++) {
        str.append(s.charAt(i));
        if (dict.contains(str.toString())) {
            String newItem = item.length() > 0 ? (item + " " + str.toString()) : str.toString();
            helper(s, dict, i + 1, newItem, res);
        }
    }
}
Next, the dynamic-programming solution. The recurrence is the same as in Word Break, except that instead of storing just true or false, we must store all valid segmentations whenever an entry is true, so that they can be retrieved later. This makes the code much longer, requires a data structure for the bookkeeping, and pushes space complexity very high, since all intermediate segmentations stay alive. There is a time saving: when we need all valid segmentations up to some position, we no longer recompute them. Still, the final complexity is dominated by the number of results. The code:
class Interval {
    int start;
    int end;
    public Interval(int start, int end) {
        this.start = start;
        this.end = end;
    }
    public Interval(Interval i) {
        start = i.start;
        end = i.end;
    }
}
ArrayList<ArrayList<Interval>> deepCopy(ArrayList<ArrayList<Interval>> paths) {
    if (paths == null) return null;
    ArrayList<ArrayList<Interval>> res = new ArrayList<ArrayList<Interval>>(paths.size());
    for (int i = 0; i < paths.size(); i++) {
        ArrayList<Interval> path = paths.get(i);
        ArrayList<Interval> copy = new ArrayList<Interval>(path.size());
        for (int j = 0; j < path.size(); j++) {
            copy.add(new Interval(path.get(j)));
        }
        res.add(copy);
    }
    return res;
}
public ArrayList<String> wordBreak(String s, Set<String> dict) {
    ArrayList<String> res = new ArrayList<String>();
    if (s == null || s.length() == 0) return res;
    // dp.get(i) holds every valid segmentation of s[0, i), as lists of intervals.
    Map<Integer, ArrayList<ArrayList<Interval>>> dp = new HashMap<Integer, ArrayList<ArrayList<Interval>>>();
    dp.put(0, new ArrayList<ArrayList<Interval>>());
    dp.get(0).add(new ArrayList<Interval>());
    for (int i = 1; i <= s.length(); i++) {
        for (int j = 0; j < i; j++) {
            String cur = s.substring(j, i);
            ArrayList<ArrayList<Interval>> paths = null;
            if (dp.containsKey(j) && dict.contains(cur)) {
                // Extend every segmentation of s[0, j) with the word s[j, i).
                paths = deepCopy(dp.get(j));
                Interval interval = new Interval(j, i);
                for (ArrayList<Interval> path : paths) {
                    path.add(interval);
                }
            }
            if (paths != null) {
                if (!dp.containsKey(i)) {
                    dp.put(i, paths);
                } else {
                    dp.get(i).addAll(paths);
                }
            }
        }
    }
    if (!dp.containsKey(s.length())) {
        return res;
    }
    // Convert each interval path into a space-separated sentence.
    ArrayList<ArrayList<Interval>> paths = dp.get(s.length());
    for (int j = 0; j < paths.size(); j++) {
        ArrayList<Interval> path = paths.get(j);
        StringBuilder str = new StringBuilder();
        for (int i = 0; i < path.size(); i++) {
            if (i != 0) { str.append(" "); }
            int start = path.get(i).start;
            int end = path.get(i).end;
            str.append(s.substring(start, end));
        }
        res.add(str.toString());
    }
    return res;
}
As you can see, the dynamic-programming code is far more complex than the brute-force solution without fundamentally improving on it, and in space it can even blow up. So DP is not a must here. A friend of mine got this problem in an Amazon interview, and the interviewer did not expect dynamic programming; brute force was acceptable. Still, it is worth thinking through the time and space trade-offs of both approaches, since the interviewer may want to hear them discussed; the expected implementation is most likely the recursive one.
One more point: both solutions above exceed the time limit on LeetCode. The reason is a rather tricky test case that cannot actually be broken yet is very long, triggering massive recording and backtracking. The usual way to pass is to first run Word Break to decide whether the string can be broken at all, and only then run the code above (a sketch of that pre-check follows). This workaround is admittedly clumsy, but it is needed to get accepted; arguably LeetCode's time limit here is stricter than is practically meaningful and mostly just drags the acceptance rate down.
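A minimal sketch of that pre-check, following the standard Word Break recurrence (dp[i] is true when s[0, i) can be segmented):
private boolean canBreak(String s, Set<String> dict) {
    boolean[] dp = new boolean[s.length() + 1];
    dp[0] = true; // the empty prefix is always breakable
    for (int i = 1; i <= s.length(); i++) {
        for (int j = 0; j < i; j++) {
            // s[0, i) is breakable if some breakable prefix s[0, j)
            // is followed by a dictionary word s[j, i).
            if (dp[j] && dict.contains(s.substring(j, i))) {
                dp[i] = true;
                break;
            }
        }
    }
    return dp[s.length()];
}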
