====== MapReduce Tutorial : Mappers, running Java Hadoop jobs, counters ======

We start by going through a simple Hadoop job with a Mapper only.

A //mapper// which processes (key, value) pairs of types (Kin, Vin) and produces (key, value) pairs of types (Kout, Vout) must be a subclass of ''org.apache.hadoop.mapreduce.Mapper<Kin, Vin, Kout, Vout>''.

The mapper must define a ''map'' method and may optionally define ''setup'' and ''cleanup'' methods, which are called once before and once after all the input pairs are processed:
<code java>
public static class TheMapper extends Mapper<Text, Text, Text, Text> {
  public void setup(Context context) throws IOException, InterruptedException {}

  public void map(Text key, Text value, Context context) throws IOException, InterruptedException {}

  public void cleanup(Context context) throws IOException, InterruptedException {}
}
</code>

Outputting (key, value) pairs is performed using the ''Context'' object passed to every mapper method -- a pair is emitted with ''context.write(key, value)''.
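
For instance, a minimal ''map'' method emitting every input pair unchanged could look as follows (an illustrative sketch only; the real job below filters the pairs instead):
<code java>
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
  // Emit the input pair unchanged -- an identity mapper.
  context.write(key, value);
}
</code>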

Here is the source of the whole Hadoop job. Its mapper outputs only the (key, value) pairs whose key starts with ''A'' (ignoring case):
<file java MapperOnlyHadoopJob.java>
import java.io.IOException;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;

public class MapperOnlyHadoopJob extends Configured implements Tool {
  // Mapper
  public static class TheMapper extends Mapper<Text, Text, Text, Text> {
    public void setup(Context context) {
    }

    public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
      if (key.getLength() > 0 && Character.toUpperCase(key.charAt(0)) == 'A') {
        context.write(key, value);
      }
    }

    public void cleanup(Context context) {
    }
  }

  // Job configuration
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.printf("Usage: %s.jar in-path out-path%n", this.getClass().getName());
      return 1;
    }

    // Name of the job is the name of the current class.
    Job job = new Job(getConf(), this.getClass().getName());

    job.setJarByClass(this.getClass());
    job.setMapperClass(TheMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    // Output format is the default -- TextOutputFormat.

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
  }

  // Main method
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new MapperOnlyHadoopJob(), args);

    System.exit(res);
  }
}
</file>

==== Remarks ====
  * The filename //must// be the same as the name of the top-level class -- this is enforced by the Java compiler. The top-level class can, however, contain any number of nested classes.
  * Multiple jobs can be submitted from one class, either in sequence or in parallel (see the sketch after this list).
  * A mismatch of types is usually detected by the compiler, but sometimes it is detected only at runtime; in that case an exception is raised and the job crashes. For example, the default output key class is ''LongWritable'', so a job whose mapper outputs ''Text'' keys fails at runtime unless ''job.setOutputKeyClass(Text.class)'' is called.
  * **VIM users**: The code completion plugin does not complete the ''Context'' class.
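
The multiple-jobs remark can be sketched as follows, assuming ''job1'' and ''job2'' are ''Job'' objects configured the same way as in the ''run'' method above:
<code java>
public int runSequentially(Job job1, Job job2) throws Exception {
  // Start the second job only after the first one succeeds.
  if (!job1.waitForCompletion(true)) return 1;
  return job2.waitForCompletion(true) ? 0 : 1;
}

public int runInParallel(Job job1, Job job2) throws Exception {
  // Submit both jobs without blocking and poll until both finish.
  job1.submit();
  job2.submit();
  while (!job1.isComplete() || !job2.isComplete())
    Thread.sleep(1000);
  return job1.isSuccessful() && job2.isSuccessful() ? 0 : 1;
}
</code>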

===== Running the job =====
The official way of running Hadoop jobs is to use the ''/net/projects/hadoop/bin/hadoop'' script. It is given the compiled job jar together with the job arguments and runs the job either locally or on the cluster.
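
Because the job's ''main'' method uses ''ToolRunner'', generic Hadoop options (e.g. ''-D property=value'') given before the job arguments are parsed automatically and become visible through ''getConf()''. A small sketch -- the property name ''my.custom.property'' is hypothetical:
<code java>
public int run(String[] args) throws Exception {
  // ToolRunner has already consumed the generic options, so getConf()
  // reflects any -D property=value overrides given on the command line.
  String custom = getConf().get("my.custom.property", "default-value");
  System.err.println("my.custom.property = " + custom);
  return 0;
}
</code>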

===== Exercise 1 =====
Download the ''MapperOnlyHadoopJob.java'' source above, compile it and run it:
  wget --no-check-certificate 'https://...'
  make -f /net/projects/hadoop/java/Makefile MapperOnlyHadoopJob.jar
  rm -rf step-24-out-sol; /net/projects/hadoop/bin/hadoop MapperOnlyHadoopJob.jar ... step-24-out-sol
  less step-24-out-sol/part-m-00000

Mind the ''rm -rf step-24-out-sol'' -- the output directory must not exist when the job is submitted, otherwise the job fails right away.

===== Counters =====

As in the Perl API, a mapper (or a reducer) can increment various counters by using ''context.getCounter'':
<code java>
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
  ...
  context.getCounter("Group", "Name").increment(1);
  ...
}
</code>
The ''getCounter'' method returns a ''Counter'' object, so if a counter is incremented frequently, it is better to obtain the ''Counter'' object only once:
<code java>
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
  ...
  Counter words = context.getCounter("Mapper", "words");
  for (String word : value.toString().split(" ")) {
    ...
    words.increment(1);
  }
}
</code>
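
Hadoop prints the final counter values when the job finishes. If they are needed programmatically, they can also be retrieved in the driver after the job completes -- a minimal sketch, assuming a finished ''job'' object and the counter group and name from the snippet above:
<code java>
Counters counters = job.getCounters();
// Retrieve the aggregated value of the counter incremented by the mappers.
long words = counters.findCounter("Mapper", "words").getValue();
System.err.printf("The mappers processed %d words.%n", words);
</code>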

===== Example 2 =====

Run a Hadoop job which uses the counters described above. Download the ''ThreeLetterWords.java'' solution, view it, compile it and run it; watch the counter values printed when the job finishes:

  wget --no-check-certificate 'https://...'
  # NOW VIEW THE FILE
  # $EDITOR ThreeLetterWords.java
  make -f /net/projects/hadoop/java/Makefile ThreeLetterWords.jar
  rm -rf step-24-out-sol; /net/projects/hadoop/bin/hadoop ThreeLetterWords.jar ... step-24-out-sol
  less step-24-out-sol/part-m-00000