This is an old revision of the document!
Table of Contents
MapReduce Tutorial - Perl API
Hadoop::Runner
# Hadoop::Runner -- describes one job: the mapper, the optional reducer,
# combiner and partitioner, the input/output formats and a few settings.
package Hadoop::Runner;
use Moose;

# Processing stages. Only the mapper is mandatory.
has mapper      => (does => 'Hadoop::Mapper', required => 1);
has reducer     => (does => 'Hadoop::Reducer');
has combiner    => (does => 'Hadoop::Reducer');       # a reducer used as combiner
has partitioner => (does => 'Hadoop::Partitioner');

# Input and output formats, text-based by default.
has input_format  => (isa => 'InputFormat',  default => 'TextInputFormat');
has output_format => (isa => 'OutputFormat', default => 'TextOutputFormat');

# Whether the output is compressed; off by default.
has output_compression => (isa => 'Bool', default => 0);

# Prefix of the Hadoop installation.
has hadoop_prefix => (isa => 'Str', default => '/SGE/HADOOP/active');

# Environment variables preserved when the Perl mappers, reducers,
# combiners and partitioners are run. The default is built in a sub so
# that every instance gets its own array reference.
has keep_env => (
    isa     => 'ArrayRef[Str]',
    default => sub { [qw(PERLLIB PERL5LIB)] },
);

# Declaration only; the body of run() is not part of this synopsis.
sub run();
- mapper – a Hadoop::Mapper to use
- reducer – an optional Hadoop::Reducer to use
- combiner – an optional Hadoop::Reducer to use as combiner
- partitioner – an optional Hadoop::Partitioner to use
- input_format – one of TextInputFormat, KeyValueTextInputFormat, SequenceFileInputFormat
- output_format – one of TextOutputFormat, SequenceFileOutputFormat
- output_compression – Bool flag controlling the compression of output
- hadoop_prefix – the prefix of the Hadoop installation. The default value is fine on the UFAL cluster.
- keep_env – which environment variables are preserved when running perl mappers, reducers, combiners and partitioners
Command line arguments supported by Hadoop::Runner::run()
Hadoop::Mapper
# Hadoop::Mapper -- role to be consumed by every mapper class.
package Hadoop::Mapper;
use Moose::Role;

# Consumers must implement map($self, $key, $value, $context), which is
# executed for every (key, value) input pair.
requires 'map';

# Optional hooks, no-ops unless the consuming class provides its own.
# Both are invoked as methods with ($self, $context), so they are declared
# without a prototype: an empty "()" prototype would wrongly advertise
# "takes no arguments" and was inconsistent with cleanup.
sub setup   { }    # executed once before any input pair is processed
sub cleanup { }    # executed once after all input pairs are processed
- sub map($self, $key, $value, $context) – executed for every (key, value) input pair. The variable $context has the following methods:
  - $context->write($key, $value) – output the ($key, $value) pair
  - $context->counter($group, $name, $increment) – increases the counter $name in the group $group by $increment
- sub setup($self, $context) – executed once before any input (key, value) pairs are processed. The $context can be used to both produce (key, value) pairs and increment counters.
- sub cleanup($self, $context) – executed once after all input (key, value) pairs are processed. The $context can be used to both produce (key, value) pairs and increment counters.
Hadoop::Reducer
# Hadoop::Reducer -- role to be consumed by every reducer and combiner class.
# The package name must be Hadoop::Reducer (not Hadoop::Reduce): that is
# the role name Hadoop::Runner checks with "does => 'Hadoop::Reducer'".
package Hadoop::Reducer;
use Moose::Role;

# Consumers must implement reduce($self, $key, $values, $context), which is
# executed for every key; $values iterates over the values of that key.
requires 'reduce';

# Optional hooks, no-ops unless the consuming class provides its own.
# Both are invoked as methods with ($self, $context), so neither carries
# an empty "()" prototype.
sub setup   { }    # executed once before any input key is processed
sub cleanup { }    # executed once after all input keys are processed
- sub reduce($self, $key, $values, $context) – executed for every $key. The $values is an iterator with the following methods:
  - $values->value() – returns the current value, undef if there is none.
  - $values->next() – advance to the next value. Returns true if there is any, false otherwise.
  - At the beginning there is no current value; the first value should be obtained by calling 'next'.
- sub reduce($self, $key, $values, $context) – the variable $context has the following methods:
  - $context->write($key, $value) – output the ($key, $value) pair
  - $context->counter($group, $name, $increment) – increases the counter $name in the group $group by $increment
- sub setup($self, $context) – executed once before any input keys are processed. The $context can be used to both produce (key, value) pairs and increment counters.
- sub cleanup($self, $context) – executed once after all input keys are processed. The $context can be used to both produce (key, value) pairs and increment counters.
Hadoop::Partitioner
# Hadoop::Partitioner -- role to be consumed by every partitioner class.
package Hadoop::Partitioner;
use Moose::Role;

# Consumers must implement getPartition($self, $key, $value, $partitions),
# which returns a partition number in the range 0 .. $partitions-1.
requires 'getPartition';

# Optional hooks, no-ops unless the consuming class provides its own.
sub setup {
}

sub cleanup {
}
- sub getPartition($self, $key, $value, $partitions) – executed for every output (key, value) pair. It must return the number of the partition, in the range 0..$partitions-1, in which the output (key, value) pair should be placed.
- sub setup($self) – executed once before any input (key, value) pairs are processed.
- sub cleanup($self) – executed once after all input (key, value) pairs are processed.