[原]基于hadoop搜索引擎实践——二级索引文件(五)
- - long1657的专栏 - 基于hadoop搜索引擎——二级索引文件. 一般生成的倒排表文件会比源文件占用空间大,主要是倒排表文件所记录的信息比较详细. 它记录了所有的索引词记录(TERM_RECORD)信息,对于常见的关键词(TERM),其MULTI_INFO可能包含几万甚至几十万个SINGLE_INFO. 由于倒排表文件很大.
/**
 * Partitioner that hands out partition indices in rotation.
 *
 * <p>The record's key and value are not inspected, so records with the same
 * key are not kept together; each call simply returns the index following
 * the one returned by the previous call, wrapping at {@code partitionNum}.
 *
 * <p>The rotation counter is a static field and is updated without any
 * synchronization.
 *
 * @param <K> map output key type (unused by the partitioning decision)
 * @param <V> map output value type (unused by the partitioning decision)
 */
public class SplitFilePartitioner<K, V> extends Partitioner<K, V> {
    /** Partition index returned by the most recent call; shared by all instances. */
    public static int lastID = 0;

    /**
     * Returns the next partition index in the rotation.
     *
     * @param key          ignored
     * @param value        ignored
     * @param partitionNum number of partitions; must be non-zero, otherwise
     *                     the modulo throws {@link ArithmeticException}
     * @return {@code (previous result + 1) % partitionNum}
     */
    @Override
    public int getPartition(K key, V value, int partitionNum) {
        // lastID is static, so it is referenced directly instead of via `this`.
        lastID = (lastID + 1) % partitionNum;
        return lastID;
    }
}

/**
 * Mapper that emits, for every input line, the text before the first tab as
 * the output key and {@code "<input split path> <input key>"} as the value.
 *
 * <p>NOTE(review): the input key is presumably the byte offset of the line
 * (as with TextInputFormat) -- confirm against the job configuration.
 */
public static class TokenIndexMapper extends
        Mapper<LongWritable, Text, Text, Text> {
    /**
     * Emits one (token, location) pair for the given line.
     *
     * @param key     input key, appended verbatim to the emitted location
     * @param value   one input line; the token is everything before its first tab,
     *                or the whole line when it contains no tab
     * @param context task context used to read the input split and write output
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // indexOf/substring instead of split("\t")[0]: split() drops trailing
        // empty strings, so a line made only of tabs yields an empty array
        // and [0] would throw ArrayIndexOutOfBoundsException.
        int tabAt = line.indexOf('\t');
        String token = (tabAt < 0) ? line : line.substring(0, tabAt);

        FileSplit split = (FileSplit) context.getInputSplit();
        String location = split.getPath().toString() + " " + key.get();
        context.write(new Text(token), new Text(location));
    }
}
/**
 * Reducer that keeps only the first value seen for each key and discards
 * the rest.
 */
public static class TokenIndexReducer extends
        Reducer<Text, Text, Text, Text> {
    /**
     * Writes {@code (key, first value)} and consumes the remaining values
     * without emitting them. Nothing is written when {@code values} is empty.
     *
     * @param key     the grouped key
     * @param values  all values grouped under {@code key}
     * @param context task context used to write the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        boolean written = false;
        // A single for-each obtains the iterator exactly once. Calling
        // values.iterator() again for every hasNext()/next() only works if
        // the Iterable keeps returning the same iterator; otherwise the
        // loop would restart from the first element and never finish.
        for (Text current : values) {
            if (!written) {
                context.write(key, current);
                written = true;
            }
            // Later values are read but intentionally not emitted.
        }
    }
}