# A MR job counts occurrences of every word in the article texts.
# Create a hash of word occurrences, populate it during the map calls without
# outputting results and finally output all (key, value) pairs in the cleanup
# method (in-mapper combining, which cuts map-output volume per task).
#
# rm -rf step-11-out-sol; perl step-11-solution.pl /home/straka/wiki/cs-text-medium/ step-11-out-sol
# less step-11-out-sol/part-*
use strict;
use warnings;

package My::Mapper;
use Moose;
with 'Hadoop::Mapper';

# Per-task accumulator: word => occurrence count. Reset in setup() so a
# reused mapper object never carries counts over from a previous task.
has 'wds' => (is => 'rw', isa => 'HashRef');

sub setup {
    my ($self) = @_;
    # Fresh counter hash for this task.
    $self->wds({});
}

sub map {
    my ($self, $key, $value, $context) = @_;
    # Split on RUNS of non-word characters: /\W+/ avoids the empty fields
    # that the single-character /\W/ delimiter would produce between every
    # pair of adjacent separators. A leading separator still yields one
    # empty leading field, hence the length guard.
    foreach my $word (split /\W+/, $value) {
        next if not length $word;
        $self->wds->{$word}++;
    }
}

sub cleanup {
    my ($self, $context) = @_;
    # Emit the aggregated counts once per task instead of once per word.
    while (my ($word, $count) = each %{ $self->wds }) {
        $context->write($word, $count);
    }
}

package My::Reducer;
use Moose;
with 'Hadoop::Reducer';

sub reduce {
    my ($self, $key, $values, $context) = @_;
    # Sum the partial per-task counts produced by the mappers.
    my $sum = 0;
    while ($values->next) {
        $sum += $values->value;
    }
    $context->write($key, $sum);
}

package main;
use Hadoop::Runner;

my $runner = Hadoop::Runner->new(
    mapper       => My::Mapper->new(),
    reducer      => My::Reducer->new(),
    input_format => 'KeyValueTextInputFormat');
$runner->run();