# A MapReduce job that counts occurrences of every word in the article texts.
# The script writes reducer results manually, using the HADOOP_* environment
# variables to locate the task's output directory, and is executed with four
# reducers, e.g.:
#
#   rm -rf step-12-out-sol; perl step-12-solution.pl -c 5 -r 4 /home/straka/wiki/cs-text-medium/ step-12-out-sol
#   less step-12-out-sol/vystup-*

use strict;
use warnings;

# Mapper: emits (word, 1) for every word found in the input value.
package My::Mapper;
use Moose;
with 'Hadoop::Mapper';

sub map {
    my ($self, $key, $value, $context) = @_;

    # Split on non-word characters; consecutive separators yield empty
    # fields, which we skip.
    foreach my $word (split /\W/, $value) {
        next if not length $word;
        $context->write($word, 1);
    }
}

# Reducer: sums the counts for each word and writes "word\tcount" lines
# to a manually opened per-task output file instead of the usual context.
package My::Reducer;
use Moose;
with 'Hadoop::Reducer';

# Write handle for this task's output file, opened in setup().
has 'outfile' => (is => 'rw', isa => 'FileHandle');

# Open the per-task output file vystup-<task_id>.txt inside the job's
# work output directory (both supplied via environment variables).
sub setup {
    my $self = shift;

    my $fname = "$ENV{HADOOP_WORK_OUTPUT_PATH}/vystup-$ENV{HADOOP_TASK_ID}.txt";
    # Strict UTF-8 encoding layer; include $! so open failures are diagnosable.
    open(my $outfile, ">:encoding(UTF-8)", $fname)
        or die "Cannot open $fname for writing: $!";
    $self->outfile($outfile);
}

# Sum all values for the key and emit one tab-separated line per word.
sub reduce {
    my ($self, $key, $values, $context) = @_;

    my $sum = 0;
    while ($values->next) {
        $sum += $values->value;
    }
    print {$self->outfile} "$key\t$sum\n";
}

# Close the output handle; check the result, since buffered write errors
# on a write handle only surface at close time.
sub cleanup {
    my $self = shift;

    close $self->outfile
        or die "Cannot close output file: $!";
}

package main;
use Hadoop::Runner;

my $runner = Hadoop::Runner->new(
    mapper       => My::Mapper->new(),
    reducer      => My::Reducer->new(),
    input_format => 'KeyValueTextInputFormat');

$runner->run();