  * ''allReduce($array_ref, $reduce_op)'' -- each machine provides multiple values. The ''$array_ref'' can contain numbers and more array references. The reduced values are stored in the original array on every machine (see the sketch below the list).
The ''$reduce_op'' can currently be one of:
  * ''$self%%->%%REDUCE_ADD''
  * ''$self%%->%%REDUCE_MIN''
  * ''$self%%->%%REDUCE_MAX''
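For illustration, here is a minimal sketch (not part of the tutorial) of a ''cooperate'' method which reduces several local values at once; the counter attributes are hypothetical and would be filled in by a ''map'' method:
  sub cooperate {
      my ($self, $context, $writeResults) = @_;
      # Every number in the (possibly nested) structure is combined with the
      # same operation; the results overwrite the originals on every machine.
      my $counts = [$self->lines, [$self->words, $self->bytes]];   # hypothetical per-machine counters
      $self->allReduce($counts, $self->REDUCE_ADD);
      # A different operation requires a separate call.
      my $extremes = [$self->shortest_line];                       # hypothetical
      $self->allReduce($extremes, $self->REDUCE_MIN);
      $context->write("$counts->[0] $extremes->[0]") if $writeResults;
  }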

It is crucial that all the mappers run simultaneously. This can be achieved using the ''/net/projects/hadoop/bin/compute-splitsize'' script: for a given Hadoop input and a requested number of mappers, it computes the appropriate splitsize.
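For example, the script's output is passed on the command line of the job, as in the commands later in this step (''job.pl'', the number of machines and the output directory are placeholders here):
  M=10; INPUT=/net/projects/hadoop/examples/inputs/numbers-small
  perl job.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $INPUT $M` $INPUT output-dir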
===== Example =====
This example reads the keys of ''/net/projects/hadoop/examples/inputs/numbers-small'', computes the sum of all the keys and prints it:
<code perl sum.pl>
package My::AllReduceMapper;
use Moose;
with 'Hadoop::AllReduceMapper';

use List::Util qw(sum);

# Keys seen by this mapper, collected in map() and reduced in cooperate().
has 'keys' => (is => 'rw', isa => 'ArrayRef[Num]', default => sub { [] });

sub map {
  my ($self, $key, $value, $context) = @_;

  push @{$self->keys}, $key;
}

sub cooperate {
  my ($self, $context, $writeResults) = @_;

  # Sum the local keys, then add the local sums up over all machines;
  # after allReduce, $sum holds the global sum on every machine.
  my $sum = sum @{$self->keys};
  $self->allReduce(\$sum, $self->REDUCE_ADD);

  $context->write($sum) if $writeResults;
}

package main;
use Hadoop::Runner;

my $runner = Hadoop::Runner->new(
  mapper => My::AllReduceMapper->new(),
  input_format => 'KeyValueTextInputFormat');

$runner->run();
</code>

You can run the example locally using:
  wget --no-check-certificate 'https://wiki.ufal.ms.mff.cuni.cz/_export/code/courses:mapreduce-tutorial:step-16?codeblock=0' -O sum.pl
  rm -rf step-16-out; perl sum.pl /net/projects/hadoop/examples/inputs/numbers-small step-16-out
  less step-16-out/part-*

To run on a cluster with //M// machines using //M// mappers:
  rm -rf step-16-out; M=#of_machines; INPUT=/net/projects/hadoop/examples/inputs/numbers-small; perl sum.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $INPUT $M` $INPUT step-16-out
  less step-16-out/part-*

===== Exercise 1 =====
  * minimum of the keys
  * maximum of the keys
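For the reduction itself, here is a minimal sketch of the ''cooperate'' part, assuming the keys are collected in ''$self%%->%%keys'' as in the example above; the downloadable template below may organize things differently:
<code perl>
use List::Util qw(min max);

sub cooperate {
  my ($self, $context, $writeResults) = @_;

  # Reduce the local minimum and maximum separately, each with its own operation.
  my $min = [min @{$self->keys}];
  $self->allReduce($min, $self->REDUCE_MIN);

  my $max = [max @{$self->keys}];
  $self->allReduce($max, $self->REDUCE_MAX);

  # The output format is illustrative only.
  $context->write("$min->[0] $max->[0]") if $writeResults;
}
</code>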
You can download the template {{:courses:mapreduce-tutorial:step-16-exercise1.txt|statistics.pl}} and execute it using:
  wget --no-check-certificate 'https://wiki.ufal.ms.mff.cuni.cz/_media/courses:mapreduce-tutorial:step-16-exercise1.txt' -O statistics.pl
  # NOW VIEW THE FILE
  # $EDITOR statistics.pl
  rm -rf step-16-out; M=#of_machines; INPUT=/net/projects/hadoop/examples/inputs/numbers-small; perl statistics.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $INPUT $M` $INPUT step-16-out
  less step-16-out/part-*
===== Exercise 2 =====
  - Else, set //min<sub>i+1</sub>// = //split//+1 and subtract from //index_to_find// the number of keys less than or equal to //split//.
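One step of this search could look as follows -- a hedged sketch, assuming the keys are collected in ''$self%%->%%keys'' and //index_to_find// is 0-based; the downloadable template below defines the actual structure:
<code perl>
# Count how many keys over all the machines are <= $split.
my $count = [scalar grep { $_ <= $split } @{$self->keys}];
$self->allReduce($count, $self->REDUCE_ADD);

if ($count->[0] > $index_to_find) {
    # The wanted key is <= $split: continue in the lower half of the range.
    $max = $split;
} else {
    # The step described above: continue in the upper half.
    $min = $split + 1;
    $index_to_find -= $count->[0];
}
</code>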

You can download the template {{:courses:mapreduce-tutorial:step-16-exercise2.txt|median.pl}} and execute it using:
  wget --no-check-certificate 'https://wiki.ufal.ms.mff.cuni.cz/_media/courses:mapreduce-tutorial:step-16-exercise2.txt' -O median.pl
  # NOW VIEW THE FILE
  # $EDITOR median.pl
  rm -rf step-16-out; M=#of_machines; INPUT=/net/projects/hadoop/examples/inputs/numbers-small; perl median.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $INPUT $M` $INPUT step-16-out
  less step-16-out/part-*

Solution: {{:courses:mapreduce-tutorial:step-16-solution2.txt|median.pl}}.

===== Exercise 3 =====
Implement an AllReduce job on ''/net/projects/hadoop/examples/inputs/points-small'' which performs the [[http://en.wikipedia.org/wiki/K-means_clustering#Standard_algorithm|K-means clustering algorithm]]. See the [[.:step-15|K-means clustering exercise]] for a description of the input data; a sketch of the per-iteration reduction is given below.

You can download the template {{:courses:mapreduce-tutorial:step-16-exercise3.txt|kmeans.pl}}. The template uses two environment variables:
  * ''CLUSTERS_NUM'' -- the number of clusters
  * ''CLUSTERS_FILE'' -- the file to read the initial clusters from
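Below is a rough sketch of the per-iteration reduction (not the template's code): each machine assigns its points to the nearest center, accumulates per-cluster coordinate sums and counts, a single ''allReduce'' with ''REDUCE_ADD'' adds them up, and every machine then recomputes the same new centers. The attributes ''$self%%->%%points'' and ''$self%%->%%clusters'', the point representation and the fixed number of iterations are assumptions; see the template and [[.:step-15|step 15]] for the actual input handling.
<code perl>
sub cooperate {
  my ($self, $context, $writeResults) = @_;

  my $K = $ENV{CLUSTERS_NUM};
  my @centers = @{$self->clusters};      # assumed: initial centers read from $ENV{CLUSTERS_FILE}
  my $dim = @{$centers[0]};              # dimension of the points

  for my $iteration (1 .. 10) {          # fixed number of iterations, for illustration only
    # Per-cluster accumulators: [count, sum of every coordinate].
    my $acc = [map { [(0) x ($dim + 1)] } 1 .. $K];

    for my $point (@{$self->points}) {   # assumed: points collected by map()
      # Assign the point to the nearest center (squared Euclidean distance).
      my ($best, $best_dist);
      for my $c (0 .. $K - 1) {
        my $dist = 0;
        $dist += ($point->[$_] - $centers[$c][$_]) ** 2 for 0 .. $dim - 1;
        ($best, $best_dist) = ($c, $dist) if !defined $best_dist || $dist < $best_dist;
      }
      $acc->[$best][0]++;
      $acc->[$best][$_ + 1] += $point->[$_] for 0 .. $dim - 1;
    }

    # A single allReduce sums all counts and coordinate sums across the machines.
    $self->allReduce($acc, $self->REDUCE_ADD);

    # Every machine now computes the same new centers.
    for my $c (0 .. $K - 1) {
      my $count = $acc->[$c][0] or next;
      $centers[$c] = [map { $acc->[$c][$_ + 1] / $count } 0 .. $dim - 1];
    }
  }

  if ($writeResults) {
    $context->write(join(" ", @$_)) for @centers;
  }
}
</code>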
You can download it using:
  wget --no-check-certificate 'https://wiki.ufal.ms.mff.cuni.cz/_media/courses:mapreduce-tutorial:step-16-exercise3.txt' -O kmeans.pl
  # NOW VIEW THE FILE
  # $EDITOR kmeans.pl
You can run it with a given number of machines on the following input data:
  * ''/net/projects/hadoop/examples/inputs/points-small'':
<code>M=#of_machines; export CLUSTERS_NUM=50 CLUSTERS_FILE=/net/projects/hadoop/examples/inputs/points-small/points.txt
rm -rf step-16-out; perl kmeans.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $CLUSTERS_FILE $M` $CLUSTERS_FILE step-16-out</code>
  * ''/net/projects/hadoop/examples/inputs/points-medium'':
<code>M=#of_machines; export CLUSTERS_NUM=100 CLUSTERS_FILE=/net/projects/hadoop/examples/inputs/points-medium/points.txt
rm -rf step-16-out; perl kmeans.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $CLUSTERS_FILE $M` $CLUSTERS_FILE step-16-out</code>
  * ''/net/projects/hadoop/examples/inputs/points-large'':
<code>M=#of_machines; export CLUSTERS_NUM=200 CLUSTERS_FILE=/net/projects/hadoop/examples/inputs/points-large/points.txt
rm -rf step-16-out; perl kmeans.pl -c $M `/net/projects/hadoop/bin/compute-splitsize $CLUSTERS_FILE $M` $CLUSTERS_FILE step-16-out</code>

Solution: {{:courses:mapreduce-tutorial:step-16-solution3.txt|kmeans.pl}}; a much faster solution, with the distance computations written in C, is {{:courses:mapreduce-tutorial:step-16-solution3_c.txt|kmeans_C.pl}}.