# A MR job counts occurrences of every word in the article texts.
# Create a hash of word occurrences, populate it during the map calls without
# outputting results and finally output all (key, value) pairs in the cleanup
# method (in-mapper combining, which cuts map-output volume per task).
#
# rm -rf step-11-out-sol; perl step-11-solution.pl /home/straka/wiki/cs-text-medium/ step-11-out-sol
# less step-11-out-sol/part-*
use strict;
use warnings;

package My::Mapper;
use Moose;
with 'Hadoop::Mapper';

# Per-task accumulator: word => occurrence count. Reset in setup() so a
# reused mapper object never carries counts over from a previous task.
has 'wds' => (is => 'rw', isa => 'HashRef');

sub setup {
    my ($self) = @_;
    # Fresh counter hash for this task.
    $self->wds({});
}

sub map {
    my ($self, $key, $value, $context) = @_;
    # Split on RUNS of non-word characters: /\W+/ avoids the empty fields
    # that the single-character /\W/ delimiter would produce between every
    # pair of adjacent separators. A leading separator still yields one
    # empty leading field, hence the length guard.
    foreach my $word (split /\W+/, $value) {
        next if not length $word;
        $self->wds->{$word}++;
    }
}

sub cleanup {
    my ($self, $context) = @_;
    # Emit the aggregated counts once per task instead of once per word.
    while (my ($word, $count) = each %{ $self->wds }) {
        $context->write($word, $count);
    }
}

package My::Reducer;
use Moose;
with 'Hadoop::Reducer';

sub reduce {
    my ($self, $key, $values, $context) = @_;
    # Sum the partial per-task counts produced by the mappers.
    my $sum = 0;
    while ($values->next) {
        $sum += $values->value;
    }
    $context->write($key, $sum);
}

package main;
use Hadoop::Runner;

my $runner = Hadoop::Runner->new(
    mapper       => My::Mapper->new(),
    reducer      => My::Reducer->new(),
    input_format => 'KeyValueTextInputFormat');
$runner->run();