Hadoop: handling multiple input files and joining them
file1:
a 1
b 2
c 3
file2:
1 !
2 @
3 #
Joining file1 and file2, the desired result is:
a !
b @
c #
1. Tag each record with the input file it came from.
2. Swap file1's key and value; then, wherever file1 and file2 share the same key, output file1's value as the key and file2's value as the value (traced below for the sample data).
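Tracing the sample data through these two steps: the mapper swaps and tags file1's records, tags file2's records as-is, and the shuffle groups both sides under the shared key:

file1: a 1  ->  (1, file1_a)      file2: 1 !  ->  (1, file2_!)
       b 2  ->  (2, file1_b)             2 @  ->  (2, file2_@)
       c 3  ->  (3, file1_c)             3 #  ->  (3, file2_#)

reduce(1, [file1_a, file2_!])  ->  a !
reduce(2, [file1_b, file2_@])  ->  b @
reduce(3, [file1_c, file2_#])  ->  c #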
package smiple;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class FileJoin {

    public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // String line = value.toString();
            // Decode the raw bytes as GBK, since the input files are GBK-encoded
            String line = new String(value.getBytes(), 0, value.getLength(), "GBK");
            StringTokenizer tokenizer = new StringTokenizer(line);
            String keystr = tokenizer.nextToken();
            String valuestr = tokenizer.nextToken();
            // Get the name of the file this split came from
            InputSplit inputSplit = context.getInputSplit();
            String fileName = ((FileSplit) inputSplit).getPath().getName();
            if ("file1".equals(fileName)) {
                // Tag the record and swap key/value so file1's value becomes the join key
                context.write(new Text(valuestr), new Text("file1_" + keystr));
            } else if ("file2".equals(fileName)) {
                context.write(new Text(keystr), new Text("file2_" + valuestr));
            }
        }
    }

    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            Text resultKey = new Text("key0");
            Text resultValue = new Text("value0");
            for (Text val : values) {
                // Strip the 6-character tag: file1 supplies the output key, file2 the output value
                if ("file1_".equals(val.toString().substring(0, 6))) {
                    resultKey = new Text(val.toString().substring(6));
                } else if ("file2_".equals(val.toString().substring(0, 6))) {
                    resultValue = new Text(val.toString().substring(6));
                }
            }
            System.out.println(resultKey.toString() + " " + resultValue.toString());
            context.write(resultKey, resultValue);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] ioArgs = new String[] { "hdfs://ip:port/mr/join/in", "hdfs://ip:port/mr/join/out" };
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: FileJoin <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "file join");
        job.setJarByClass(FileJoin.class);
        // Set the Mapper and Reducer classes
        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
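Two practical notes. First, the input and output paths are hardcoded in ioArgs, so GenericOptionsParser ignores whatever is passed on the command line; to take the paths as real arguments, hand it args instead of ioArgs. Second, the mapper branches on the literal file names, so the input directory must contain files named exactly file1 and file2. Assuming the class is packed into a jar named filejoin.jar (a hypothetical name), a run could look like:

hadoop fs -mkdir -p /mr/join/in
hadoop fs -put file1 file2 /mr/join/in
hadoop jar filejoin.jar smiple.FileJoin
hadoop fs -cat /mr/join/out/part-r-00000

This is a classic reduce-side join: the file1_/file2_ prefix is what lets the reducer tell which side each value came from. For anything beyond a demo, MultipleInputs with a separate mapper per input is a sturdier way to tag sources than checking file names.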