<?xml version="1.0" encoding="UTF-8"?>
<!-- generator="FeedCreator 1.8" -->
<?xml-stylesheet href="https://wiki.ufal.ms.mff.cuni.cz/lib/exe/css.php?s=feed" type="text/css"?>
<rdf:RDF
    xmlns="http://purl.org/rss/1.0/"
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
    xmlns:dc="http://purl.org/dc/elements/1.1/">
    <channel rdf:about="https://wiki.ufal.ms.mff.cuni.cz/feed.php">
        <title>ufal wiki courses:mapreduce-tutorial</title>
        <description></description>
        <link>https://wiki.ufal.ms.mff.cuni.cz/</link>
        <image rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/lib/tpl/ufal/images/favicon.ico" />
        <dc:date>2026-04-20T06:25:45+00:00</dc:date>
        <items>
            <rdf:Seq>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:hadoop-job-overview?rev=1328505109&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:if-things-go-wrong?rev=1328532923&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:introduction?rev=1326661331&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:making-your-job-configurable?rev=1328474721&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:managing-a-hadoop-cluster?rev=1360333511&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:perl-api?rev=1327999094&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:running-jobs?rev=1360330413&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-1?rev=1327933553&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-2?rev=1327849433&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-3?rev=1327999229&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-4?rev=1327999239&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-5?rev=1328021763&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-6?rev=1328532937&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-7?rev=1360330601&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-8?rev=1328021744&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-9?rev=1327999326&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-10?rev=1327999119&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-11?rev=1327999143&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-12?rev=1327999154&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-13?rev=1328021679&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-14?rev=1328022505&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-15?rev=1327851612&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-16?rev=1328531390&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-21?rev=1328005139&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-22?rev=1328003187&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-23?rev=1328016817&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-24?rev=1328023508&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-25?rev=1328019157&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-26?rev=1328017277&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-27?rev=1328017146&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-28?rev=1328465421&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-29?rev=1328465656&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-30?rev=1328010077&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-31?rev=1328536369&amp;do=diff"/>
            </rdf:Seq>
        </items>
    </channel>
    <image rdf:about="https://wiki.ufal.ms.mff.cuni.cz/lib/tpl/ufal/images/favicon.ico">
        <title>ufal wiki</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/</link>
        <url>https://wiki.ufal.ms.mff.cuni.cz/lib/tpl/ufal/images/favicon.ico</url>
    </image>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:hadoop-job-overview?rev=1328505109&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-06T06:11:49+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:hadoop-job-overview</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:hadoop-job-overview?rev=1328505109&amp;do=diff</link>
        <description>MapReduce Tutorial : Hadoop job overview

A regular Hadoop job consists of:

	*  [required] a mapper -- processes input (key, value) pairs, produces (key, value) pairs. There can be multiple mappers: each file is divided into (by default 32MB) splits and each split is processed by one mapper. Script</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:if-things-go-wrong?rev=1328532923&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-06T13:55:23+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:if-things-go-wrong</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:if-things-go-wrong?rev=1328532923&amp;do=diff</link>
        <description>MapReduce Tutorial : If things go wrong

A lot can go wrong in the process of creating cluster and submitting the Hadoop job:

	*  Hadoop::Runner.pm module not found: The Perl Hadoop package is not configured, see Setting the environment.
	*  ipc.Client: Retrying connect to server: IP_ADDRESS:PORT. Already tried ? time(s)</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:introduction?rev=1326661331&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-15T22:02:11+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:introduction</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:introduction?rev=1326661331&amp;do=diff</link>
        <description>Large data processing using MapReduce

For an introduction, it is best to read the original paper.
There are also Czech slides (up to slide 45).

There are nice slides from the three-day course available at &lt;http://sites.google.com/site/mriap2008/lectures&gt;.
I would suggest to start with &lt;http://sites.google.com/site/mriap2008/intro_to_mapreduce.pdf&gt; .

Now is good time to solve the following exercises:</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:making-your-job-configurable?rev=1328474721&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-05T21:45:21+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:making-your-job-configurable</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:making-your-job-configurable?rev=1328474721&amp;do=diff</link>
        <description>MapReduce Tutorial : Making your job configurable

Sometimes it is desirable for a Hadoop job to be configurable without recompiling/rewriting the source. This can be achieved:

	*  Java: use Hadoop properties:
		*  when running the job, use /net/projects/hadoop/bin/hadoop job.jar -Dname1=value1 -Dname2=value2</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:managing-a-hadoop-cluster?rev=1360333511&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2013-02-08T15:25:11+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:managing-a-hadoop-cluster</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:managing-a-hadoop-cluster?rev=1360333511&amp;do=diff</link>
        <description>MapReduce Tutorial : Managing a Hadoop cluster

Hadoop clusters can be created and stopped dynamically, using the SGE cluster. A Hadoop cluster consists of one jobtracker (master of the cluster) and multiple tasktrackers. The cluster is identified by its jobtracker. The jobtracker listens on two ports</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:perl-api?rev=1327999094&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:38:14+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:perl-api</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:perl-api?rev=1327999094&amp;do=diff</link>
        <description>MapReduce Tutorial - Perl API

Hadoop::Runner


package Hadoop::Runner;
use Moose;

has 'mapper' =&gt; (does =&gt; 'Hadoop::Mapper', required =&gt; 1);
has 'reducer' =&gt; (does =&gt; 'Hadoop::Reducer');
has 'combiner' =&gt; (does =&gt; 'Hadoop::Reducer');
has 'partitioner' =&gt; (does =&gt; 'Hadoop::Partitioner');

has 'input_format' =&gt; (isa =&gt; 'InputFormat', default =&gt; 'TextInputFormat');
has 'output_format' =&gt; (isa =&gt; 'OutputFormat', default =&gt; 'TextOutputFormat');
has 'output_compression' =&gt; (isa =&gt; 'Bool', default =&gt;…</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:running-jobs?rev=1360330413&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2013-02-08T14:33:33+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:running-jobs</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:running-jobs?rev=1360330413&amp;do=diff</link>
        <description>MapReduce Tutorial : Running jobs

The input of a Hadoop job is either a file, or a directory. In the latter case all files in the directory are processed.

The output of a Hadoop job must be a directory, which does not exist.

Running jobs
  Command  Run</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-1?rev=1327933553&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-30T15:25:53+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-1</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-1?rev=1327933553&amp;do=diff</link>
        <description>MapReduce Tutorial : Setting the environment

Requirements

The tutorial expects you to be logged to a computer in the UFAL cluster and be able to submit jobs using SGE. In this environment, Hadoop is installed in /SGE/HADOOP/active.

To use the Perl</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-2?rev=1327849433&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-29T16:03:53+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-2</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-2?rev=1327849433&amp;do=diff</link>
        <description>MapReduce tutorial : Input and output format, testing data.

The MapReduce framework is frequently using (key, value) pairs. These pairs can be read from a file and written to a file and there are several formats available.

Input formats

	*  TextInputFormat</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-3?rev=1327999229&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:40:29+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-3</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-3?rev=1327999229&amp;do=diff</link>
        <description>MapReduce Tutorial : Basic mapper

The simplest Hadoop job consists of a mapper only.  The input data is divided in several parts, each processed by an independent mapper, and the results are collected in one directory, one file per mapper.

The Hadoop framework silently handles failures. If a mapper task fails, another is executed and the input of the failed attempt is discarded.</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-4?rev=1327999239&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:40:39+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-4</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-4?rev=1327999239&amp;do=diff</link>
        <description>MapReduce Tutorial : Counters

Sometimes it is useful to count events differently than outputting them as (key, value) pairs. For that reason Hadoop offers simple counter framework.

Hadoop maintains a collection of pre-defined and user-defined counters. Every counter is identified by its group name and counter name. The group name and counter name is an arbitrary string</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-5?rev=1328021763&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T15:56:03+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-5</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-5?rev=1328021763&amp;do=diff</link>
        <description>MapReduce Tutorial : Basic reducer

The interesting part of a Hadoop job is the reducer -- after all mappers produce the (key, value) pairs, for every unique key and all its values a reduce function is called. The reduce function can output (key, value) pairs, which are written to disk.</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-6?rev=1328532937&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-06T13:55:37+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-6</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-6?rev=1328532937&amp;do=diff</link>
        <description>MapReduce Tutorial : Running on cluster

Probably the most important feature of MapReduce is to run computations distributively.

So far all our Hadoop jobs were executed locally. But all of them can be executed on multiple machines. It suffices to add parameter</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-7?rev=1360330601&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2013-02-08T14:36:41+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-7</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-7?rev=1360330601&amp;do=diff</link>
        <description>MapReduce Tutorial : Dynamic Hadoop cluster for several computations

When multiple Hadoop jobs should be executed, it is better to reuse the cluster instead of allocating a new one for every computation.

A cluster can be created using
/net/projects/hadoop/bin/hadoop-cluster -c number_of_machines -w sec_to_wait_after_all_jobs_completed</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-8?rev=1328021744&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T15:55:44+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-8</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-8?rev=1328021744&amp;do=diff</link>
        <description>MapReduce Tutorial : Multiple mappers, reducers and partitioning

A Hadoop job, which is expected to run on many computers at the same time, needs to use multiple mappers and reducers. It is possible to control these numbers to some degree.

Multiple mappers</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-9?rev=1327999326&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:42:06+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-9</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-9?rev=1327999326&amp;do=diff</link>
        <description>MapReduce Tutorial : Hadoop properties

We have controlled the Hadoop jobs using the Perl API so far, which is quite limited.

The Hadoop itself uses many configuration options. The options can be set on command line using the -Dname=value syntax:
perl script.pl [-jt cluster_master | -c cluster_size [-w sec_to_wait]] [-r number_of_reducers] [-Dname=value -Dname=value ...] input output_path</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-10?rev=1327999119&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:38:39+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-10</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-10?rev=1327999119&amp;do=diff</link>
        <description>MapReduce Tutorial : Combiners

Sometimes the reduce is a binary operation, which is associative and commutative, e.g. +. In that case it is inefficient to produce all the (key, value) pairs in the mappers and send them through the network.

Instead, reducer can be executed right after the map, on</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-11?rev=1327999143&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:39:03+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-11</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-11?rev=1327999143&amp;do=diff</link>
        <description>MapReduce Tutorial : Initialization and cleanup of MR tasks, performance of combiners

During the mapper or reducer task execution the following steps take place:

	*  Perl script is executed in the current directory, ie. in the directory where the job was executed / submitted from.</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-12?rev=1327999154&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T09:39:14+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-12</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-12?rev=1327999154&amp;do=diff</link>
        <description>MapReduce Tutorial : Additional output from mappers and reducers

Sometimes it would be useful to create output files manually in reducers -- either multiple files are needed per reducer, or a specific file format is desired.

Problem is that Hadoop framework can spawn several task attempts for the same reducer task</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-13?rev=1328021679&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T15:54:39+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-13</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-13?rev=1328021679&amp;do=diff</link>
        <description>MapReduce Tutorial : Exercise - sorting

You are given data consisting of (31-bit integer, string data) pairs. These are available in plain text format:
 Path  Size  /net/projects/hadoop/examples/inputs/numbers-small  3MB  /net/projects/hadoop/examples/inputs/numbers-medium</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-14?rev=1328022505&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T16:08:25+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-14</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-14?rev=1328022505&amp;do=diff</link>
        <description>MapReduce Tutorial : Exercise - N-gram language model

For a given N create a simple N-gram language model. You can start experimenting on the following data:
 Path  Size  /home/straka/wiki/cs-seq-medium  8MB  /home/straka/wiki/cs-seq  82MB  /home/straka/wiki/en-seq</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-15?rev=1327851612&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-29T16:40:12+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-15</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-15?rev=1327851612&amp;do=diff</link>
        <description>MapReduce Tutorial : K-means clustering

Implement the K-means clustering algorithm. You can use the following data:
 Path  Number of points  Number of dimensions  Number of clusters  /net/projects/hadoop/examples/inputs/points-small  10000  50  50  /net/projects/hadoop/examples/inputs/points-medium</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-16?rev=1328531390&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-06T13:29:50+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-16</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-16?rev=1328531390&amp;do=diff</link>
        <description>MapReduce Tutorial: Implementing iterative MapReduce jobs faster using All-Reduce

Implementing an iterative computation by running a separate Hadoop job for every iteration is usually not very efficient (although it is fault tolerant).

If we have enough machines that all input data fits into memory, we can implement iterative computation like this:</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-21?rev=1328005139&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T11:18:59+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-21</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-21?rev=1328005139&amp;do=diff</link>
        <description>MapReduce Tutorial : Preparing the environment

To use the Hadoop Java API, you must be able to compile the Java sources with the Hadoop library. An easy way is to use a prepared Makefile:

	*  Create a directory for the Java sources.
	*  Create a Makefile</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-22?rev=1328003187&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T10:46:27+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-22</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-22?rev=1328003187&amp;do=diff</link>
        <description>MapReduce Tutorial : Setting Eclipse

This is not well tested.

If you do not like VIM, you can try using Eclipse as a Java editor. You should

	*  Download /SGE/HADOOP/active/hadoop-core-1.0.1-SNAPSHOT.jar
	*  Download the directory /net/projects/hadoop/java-extensions/classes</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-23?rev=1328016817&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T14:33:37+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-23</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-23?rev=1328016817&amp;do=diff</link>
        <description>MapReduce Tutorial : Predefined formats and types

Currently there are two different Java APIs:

	*  org.apache.hadoop.mapred: This is the original API, which is currently deprecated.
	*  org.apache.hadoop.mapreduce: This is the new API, which we will be using in this tutorial. The only problem is that some library classes have not yet been converted to use the new</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-24?rev=1328023508&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T16:25:08+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-24</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-24?rev=1328023508&amp;do=diff</link>
        <description>MapReduce Tutorial : Mappers, running Java Hadoop jobs, counters

We start by going through a simple Hadoop job with Mapper only.

A mapper which processes (key, value) pairs of types (Kin, Vin) and produces (key, value) pairs of types (Kout, Vout) must be a subclass of</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-25?rev=1328019157&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T15:12:37+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-25</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-25?rev=1328019157&amp;do=diff</link>
        <description>MapReduce Tutorial : Reducers, combiners and partitioners.

A reducer in a Hadoop job must be a subclass of Reducer&lt;Kin, Vin, Kout, Vout&gt;.

As in the Perl API, any reducer can be used as a combiner.

Here is a Hadoop job computing the number of occurrences of all words:


import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.map…</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-26?rev=1328017277&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T14:41:17+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-26</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-26?rev=1328017277&amp;do=diff</link>
        <description>MapReduce Tutorial : Compression and job configuration

Compression

The output files can be compressed using


  FileOutputFormat.setCompressOutput(job, true);


The default compression format is deflate -- raw Zlib compression. Several other compression formats can be selected:</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-27?rev=1328017146&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T14:39:06+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-27</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-27?rev=1328017146&amp;do=diff</link>
        <description>MapReduce Tutorial : Running multiple Hadoop jobs in one source file

The Java API offers the possibility to submit multiple Hadoop jobs in one source file. A job can be submitted either using

	*  job.waitForCompletion -- the job is submitted and the method waits for it to finish (successfully or not).</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-28?rev=1328465421&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-05T19:10:21+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-28</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-28?rev=1328465421&amp;do=diff</link>
        <description>MapReduce Tutorial : Custom data types

An important feature of the Java API is that custom data and format types can be provided. In this step we implement two custom data types.

BERIntWritable

We want to implement BERIntWritable, which is an int stored in the format of</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-29?rev=1328465656&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-05T19:14:16+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-29</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-29?rev=1328465656&amp;do=diff</link>
        <description>MapReduce Tutorial : Custom sorting and grouping comparators.

Custom sorting comparator

The keys are sorted before processed by a reducer, using a
Raw comparator. The default comparator uses the compareTo method provided by the key type, which is a subclass of WritableComparable. Consider for example the following</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-30?rev=1328010077&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-01-31T12:41:17+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-30</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-30?rev=1328010077&amp;do=diff</link>
        <description>MapReduce Tutorial : Custom input formats

Every custom format reading keys of type K and values of type V must subclass InputFormat&lt;K, V&gt;. Usually it is easier to subclass FileInputFormat&lt;K, V&gt; -- the file listing and splitting is then solved by the FileInputFormat itself.

FileAsPathInputFormat</description>
    </item>
    <item rdf:about="https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-31?rev=1328536369&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2012-02-06T14:52:49+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>courses:mapreduce-tutorial:step-31</title>
        <link>https://wiki.ufal.ms.mff.cuni.cz/courses:mapreduce-tutorial:step-31?rev=1328536369&amp;do=diff</link>
        <description>MapReduce Tutorial: Implementing iterative MapReduce jobs faster using All-Reduce

Implementing an iterative computation by running a separate Hadoop job for every iteration is usually not very efficient (although it is fault tolerant).

If we have enough machines that all input data fits into memory, we can implement iterative computation like this:</description>
    </item>
</rdf:RDF>
