This is an old revision of the document!
Table of Contents
MapReduce Tutorial - Perl API
Hadoop::Runner
# Hadoop::Runner -- Moose class describing how one MapReduce job is run.
#
# Attributes:
#   mapper             - object doing the Hadoop::Mapper role (required)
#   reducer            - optional object doing the Hadoop::Reducer role
#   combiner           - optional object doing the Hadoop::Reducer role
#   partitioner        - optional object doing the Hadoop::Partitioner role
#   input_format       - an 'InputFormat' value, default 'TextInputFormat'
#   output_format      - an 'OutputFormat' value, default 'TextOutputFormat'
#   output_compression - Bool, default 0
#   hadoop_prefix      - Str, default '/SGE/HADOOP/active'
#   keep_env           - ArrayRef[Str], default [PERLLIB, PERL5LIB]
#
# NOTE(review): the 'InputFormat' and 'OutputFormat' type constraints are not
# declared in this snippet -- presumably defined elsewhere; confirm.
package Hadoop::Runner;
use Moose;

# Pluggable job components.
has mapper      => (does => 'Hadoop::Mapper', required => 1);
has reducer     => (does => 'Hadoop::Reducer');
has combiner    => (does => 'Hadoop::Reducer');
has partitioner => (does => 'Hadoop::Partitioner');

# Input/output handling.
has input_format       => (isa => 'InputFormat',  default => 'TextInputFormat');
has output_format      => (isa => 'OutputFormat', default => 'TextOutputFormat');
has output_compression => (isa => 'Bool',         default => 0);

# Execution environment.
has hadoop_prefix => (isa => 'Str', default => '/SGE/HADOOP/active');
has keep_env      => (
    isa     => 'ArrayRef[Str]',
    # A sub is used so that every instance gets its own fresh array.
    default => sub { [ qw(PERLLIB PERL5LIB) ] },
);

# Forward declaration only (empty prototype, no body); the implementation
# of run is not shown in this synopsis.
sub run();
mapper – a Hadoop::Mapper to use
reducer – an optional Hadoop::Reducer to use
combiner – an optional Hadoop::Reducer to use as combiner
partitioner – an optional Hadoop::Partitioner to use
input_format – one of TextInputFormat, KeyValueTextInputFormat, SequenceFileInputFormat
output_format – one of TextOutputFormat, SequenceFileOutputFormat
output_compression – Bool flag controlling the compression of output
hadoop_prefix – the prefix of the Hadoop installation. The default value is fine on the UFAL cluster.
keep_env – which environment variables are preserved when running Perl mappers, reducers, combiners and partitioners
Hadoop::Mapper
# Hadoop::Mapper -- Moose role to be consumed by mapper classes.
#
# Required method (must be provided by the consuming class):
#   map     - executed for every (key, value) input pair.
#
# Optional hooks (defaults provided here, both simply return true):
#   setup, cleanup
#   NOTE(review): presumably invoked once before / after the map calls --
#   confirm against the runner implementation.
package Hadoop::Mapper;
use Moose::Role;

requires 'map';

# Default hook implementations. They are deliberately declared WITHOUT a
# prototype: a prototype is ignored on method calls anyway, would reject a
# function-style call that passes the invocant, and would turn into a
# zero-argument signature (breaking $obj->setup) if signatures were enabled.
sub setup   { return 1; }
sub cleanup { return 1; }
sub map($self, $key, $value, $context)
– executed for every (key, value) input pair. The variable '$context' has the following methods:
$context->write($key, $value) – outputs the ($key, $value) pair
$context->counter($group, $name, $increment) – increases the counter $name in the group $group by $increment