[原]基于hadoop搜索引擎实践——二级索引文件(五)
- - long1657的专栏基于hadoop搜索引擎——二级索引文件. 一般生成的倒排表文件会比源文件占用空间大,主要是倒排表文件所记录的信息比较详细. 它记录了所有的索引词记录(TERM_RECORD)信息,对于常见的关键词(TERM),其MULTI_INFO可能包含几万甚至几十万个SINGLE_INFO. 由于倒排表文件很大.
/**
 * Partitioner that ignores the record's key and value and hands out
 * partition numbers in rotation.
 *
 * <p>Each call advances a shared counter by one and wraps it with the number
 * of partitions, so consecutive records land in consecutive partitions.
 * The counter is a plain static field with no synchronization.
 *
 * @param <K> key type (not inspected)
 * @param <V> value type (not inspected)
 */
public class SplitFilePartitioner<K, V> extends Partitioner<K, V> {

    /** Partition number produced by the most recent call; shared by all instances. */
    public static int lastID = 0;

    /**
     * Returns the next partition number in the rotation.
     *
     * @param key          record key; unused
     * @param value        record value; unused
     * @param partitionNum total number of partitions, used as the modulus
     * @return the previous result plus one, modulo {@code partitionNum}
     */
    @Override
    public int getPartition(K key, V value, int partitionNum) {
        // Advance the shared counter first, then fold it back into range.
        final int bumped = ++lastID;
        lastID = bumped % partitionNum;
        return lastID;
    }
}
public static class TokenIndexMapper extends Mapper<LongWritable, Text, Text, Text> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); FileSplit split = (FileSplit) context.getInputSplit(); context.write(new Text(line.split("\t")[0]), new Text(split .getPath().toString() + " " + key.get())); } } public static class TokenIndexReducer extends Reducer<Text, Text, Text, Text> { @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { boolean flag = true; while (values.iterator().hasNext()) { if (flag) { context.write(key, values.iterator().next()); flag = false; }else{ values.iterator().next(); } } } }