# A MapReduce job that counts occurrences of every word in the article texts.
# The script writes reducer results manually, using the HADOOP_* environment
# variables to locate the task's output directory, and is executed with four
# reducers, e.g.:
#
#   rm -rf step-12-out-sol; perl step-12-solution.pl -c 5 -r 4 /home/straka/wiki/cs-text-medium/ step-12-out-sol
#   less step-12-out-sol/vystup-*

use strict;
use warnings;

# Mapper: emits (word, 1) for every word found in the input value.
package My::Mapper;
use Moose;
with 'Hadoop::Mapper';

sub map {
    my ($self, $key, $value, $context) = @_;

    # Split on non-word characters; consecutive separators yield empty
    # fields, which we skip.
    foreach my $word (split /\W/, $value) {
        next if not length $word;
        $context->write($word, 1);
    }
}

# Reducer: sums the counts for each word and writes "word\tcount" lines
# to a manually opened per-task output file instead of the usual context.
package My::Reducer;
use Moose;
with 'Hadoop::Reducer';

# Write handle for this task's output file, opened in setup().
has 'outfile' => (is => 'rw', isa => 'FileHandle');

# Open the per-task output file vystup-<task_id>.txt inside the job's
# work output directory (both supplied via environment variables).
sub setup {
    my $self = shift;

    my $fname = "$ENV{HADOOP_WORK_OUTPUT_PATH}/vystup-$ENV{HADOOP_TASK_ID}.txt";
    # Strict UTF-8 encoding layer; include $! so open failures are diagnosable.
    open(my $outfile, ">:encoding(UTF-8)", $fname)
        or die "Cannot open $fname for writing: $!";
    $self->outfile($outfile);
}

# Sum all values for the key and emit one tab-separated line per word.
sub reduce {
    my ($self, $key, $values, $context) = @_;

    my $sum = 0;
    while ($values->next) {
        $sum += $values->value;
    }
    print {$self->outfile} "$key\t$sum\n";
}

# Close the output handle; check the result, since buffered write errors
# on a write handle only surface at close time.
sub cleanup {
    my $self = shift;

    close $self->outfile
        or die "Cannot close output file: $!";
}

package main;
use Hadoop::Runner;

my $runner = Hadoop::Runner->new(
    mapper       => My::Mapper->new(),
    reducer      => My::Reducer->new(),
    input_format => 'KeyValueTextInputFormat');

$runner->run();