tmp: Hadoop MapReduce開発環境

hadoop mapreduceの開発環境をつくる  
(といっても分散環境とかではなく、ただのjavaプログラムの開発環境)
(Mac OS X 10.8.5)

### Eclipseのダウンロード
* <http://www.eclipse.org/downloads/>から最新をダウンロード(Eclipse Standard 4.3.1)
* 解凍して配置(`/Applications/` 直下に置いた)

日本語化する場合は<http://fiahfy.blogspot.jp/2013/01/maceclipse.html>を参照してください  
バージョンが違っても大体同じはず  
めんどくさいのでここでは省略

### hadoopのダウンロード
* [公式](http://hadoop.apache.org/)にあるのでここからダウンロード(`hadoop-0.23.9` をダウンロードした)
* 解凍して配置(eclipseのフォルダ `/Applications/eclipse/` 直下に置いた)

### サンプルプログラムで確認

#### プロジェクトの作成と設定
* eclipseを起動して、プロジェクト作成(プロジェクト名 `WordCount` を作成)
* projectを右クリックして > *Build Path* > *Configure Build Path...*
* *Libraries* タブの *Add External JARs...* からhadoopフォルダ内の以下のjarファイルを全部追加する

```
hadoop-0.23.9/share/hadoop/common/*.jar
hadoop-0.23.9/share/hadoop/common/lib/*.jar
hadoop-0.23.9/share/hadoop/mapreduce/*.jar
hadoop-0.23.9/share/hadoop/mapreduce/lib/*.jar
hadoop-0.23.9/share/hadoop/yarn/*.jar
hadoop-0.23.9/share/hadoop/yarn/lib/*.jar
```

#### サンプルプログラムを作成

出現した単語をカウントする基本的なプログラムを作成する

* *src* に以下のコードを `start.java` で作成

```java
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
//import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Word-count MapReduce job, launched through {@link ToolRunner}.
 *
 * <p>Expects exactly two arguments: an input path and an output path. Each
 * input line is split on whitespace, every non-empty token is emitted with a
 * count of 1, and the reducer adds the counts up per token.
 */
public class start extends Configured implements Tool {

  /**
   * Program entry point: runs the job via {@link ToolRunner} and exits the JVM
   * with the status returned by {@link #run(String[])}.
   *
   * @param args command-line arguments, forwarded to {@link ToolRunner#run}
   * @throws Exception if the job setup or execution fails
   */
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new start(), args);
    System.exit(exitCode);
  }

  /**
   * Configures and runs the word-count job.
   *
   * @param args {@code <input> <output>} paths
   * @return 0 if the job completed successfully, 1 if it failed, -1 if the
   *     wrong number of arguments was supplied
   * @throws Exception if the file system or the job submission fails
   */
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf("Usage:%s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }
    String inputDir = args[0];
    String outputDir = args[1];

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(getClass());

    // Recursively remove any previous output so the job can be re-run against
    // the same output path.
    FileSystem fs = FileSystem.get(URI.create(outputDir), conf);
    fs.delete(new Path(outputDir), true);

    FileInputFormat.setInputPaths(job, new Path(inputDir));
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(mapper.class);
    // No combiner is registered. Because the reducer sums its input values, it
    // could also be registered as a combiner without changing the totals.
    job.setReducerClass(reducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // A single reduce task is requested.
    job.setNumReduceTasks(1);

    boolean succeeded = job.waitForCompletion(true);

    return succeeded ? 0 : 1;
  }

  /**
   * Emits {@code (token, 1)} for every non-empty whitespace-separated token of
   * each input line.
   */
  public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Constant count attached to every emitted token. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Reused output key; avoids allocating a new Text per token. */
    private final Text word = new Text();

    /**
     * Splits one input line into tokens and writes each non-empty token with
     * a count of 1.
     *
     * @param key input key (not used)
     * @param value the input line
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      // Splitting on a single whitespace character yields empty strings for
      // runs of whitespace; those are skipped below.
      for (String token : line.split("\\s")) {
        if (token.length() > 0) {
          word.set(token);
          context.write(word, ONE);
        }
      }
    }
  }

  /**
   * Sums the counts received for each token and emits {@code (token, total)}.
   */
  public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up all values for one key and writes the total.
     *
     * <p>The values are summed rather than merely counted, so the result is
     * also correct when the incoming values are partial sums instead of 1s.
     *
     * @param key the token
     * @param values the counts emitted for this token
     * @param context sink for the emitted total
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable value : values) {
        sum += value.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }
}
```

#### 実行準備

* projectを右クリックして > *Run As* > *Run Configurations…*
* *Java Application*を選択して左上のファイルのアイコンで設定を追加、`Name` は適当に
* *Main* タブで *Main class:* に `start` と設定
* *Arguments* タブで `input output` と引数を2つ指定
* *Apply* して保存
* プロジェクトディレクトリ直下にinputとoutput用のディレクトリを作成する

```
WordCount/input
WordCount/output
```

* inputディレクトリにカウントしたい文章を保存したファイルを置く  
(ファイルの中身とファイル名は何でもいい `WordCount/input/input.txt`)

```
apple orange orange
grape orange
```

* 上記で作成した実行構成で実行
* 正常に実行されれば、outputディレクトリに結果が出力される  
(`WordCount/output/part-r-00000`)

```
apple 1
grape 1
orange 3
```

### 参考
* <http://d.hatena.ne.jp/tetsuya_odaka/20130621/1371810727>
* <http://d.hatena.ne.jp/osaca_z4/201006>

tmp

Monday, November 18, 2013

Hadoop MapReduce開発環境

No comments:

Post a Comment