Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
courses:mapreduce-tutorial:step-28 [2012/01/29 16:34] straka |
courses:mapreduce-tutorial:step-28 [2012/02/05 19:10] (current) straka |
||
---|---|---|---|
Line 1: | Line 1: | ||
- | ====== MapReduce Tutorial : Running multiple Hadoop jobs in one class ====== | + | ====== MapReduce Tutorial : Custom data types ====== |
- | The Java API offers possibility to submit multiple Hadoop job in one class. A job can be submitted either using | + | An important feature of the Java API is that custom data and format types can be provided. In this step we implement two custom data types. |
- | * [[http:// | + | |
- | * [[http:// | + | |
+ | ===== BERIntWritable ===== | ||
+ | |||
+ | We want to implement BERIntWritable, | ||
+ | |||
+ | The new class must implement the [[http:// | ||
+ | |||
+ | <code java> | ||
+ | public class BERIntWritable implements Writable { | ||
+ | private int value; | ||
+ | |||
+ | public void readFields(DataInput in) throws IOException { | ||
+ | value = 0; | ||
+ | |||
+ | byte next; | ||
+ | while (((next = in.readByte()) & 0x80) != 0) { | ||
+ | value = (value << 7) | (next & 0x7F); | ||
+ | } | ||
+ | value = (value << 7) | next; | ||
+ | } | ||
+ | |||
+ | public void write(DataOutput out) throws IOException { | ||
+ | int mask_shift = 28; | ||
+ | while (mask_shift > 0 && (value & (0x7F << mask_shift)) == 0) mask_shift -= 7; | ||
+ | while (mask_shift > 0) { | ||
+ | out.writeByte(0x80 | ((value >> mask_shift) & 0x7F)); | ||
+ | mask_shift -= 7; | ||
+ | } | ||
+ | out.writeByte(value & 0x7F); | ||
+ | } | ||
+ | </ | ||
+ | Accessory methods '' | ||
+ | <code java> | ||
+ | public int get() { return value; } | ||
+ | public void set(int value) { this.value = value; } | ||
+ | public String toString() { return String.valueOf(value); | ||
+ | } | ||
+ | </ | ||
+ | Remark: If the '' | ||
+ | |||
+ | Such implementation can be used as a type of //values//. If we wanted to use it as a type of //keys//, we need to implement [[http:// | ||
+ | <code java> | ||
+ | public class BERIntWritable implements WritableComparable { | ||
+ | ... //Same as before | ||
+ | |||
+ | public int compareTo(Object other) { | ||
+ | int otherValue = ((BERIntWritable)other).get(); | ||
+ | return value < otherValue ? -1 : (value == otherValue ? 0 : 1); | ||
+ | } | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | ===== PairWritable< | ||
+ | |||
+ | As another example, we implement a type consisting of two user-defined '' | ||
+ | <code java> | ||
+ | public static class PairWritable< | ||
+ | private A first; | ||
+ | private B second; | ||
+ | |||
+ | public void readFields(DataInput in) throws IOException { | ||
+ | first.readFields(in); | ||
+ | second.readFields(in); | ||
+ | } | ||
+ | |||
+ | public void write(DataOutput out) throws IOException { | ||
+ | first.write(out); | ||
+ | second.write(out); | ||
+ | } | ||
+ | |||
+ | public A getFirst() { return first; } | ||
+ | public B getSecond() { return second; } | ||
+ | public void setFirst(A first) { this.first = first; } | ||
+ | public void setSecond(B first) { this.second = second; } | ||
+ | public String toString() { return String.format(" | ||
+ | public PairWritable(A first, B second) { this.first = first; this.second = second; } | ||
+ | } | ||
+ | </ | ||
+ | Remark: Remark: If the '' | ||
+ | |||
+ | We did not define '' | ||
+ | <code java> | ||
+ | public static class PairWritableComparable< | ||
+ | private A first; | ||
+ | private B second; | ||
+ | |||
+ | public void readFields(DataInput in) throws IOException { | ||
+ | first.readFields(in); | ||
+ | second.readFields(in); | ||
+ | } | ||
+ | |||
+ | public void write(DataOutput out) throws IOException { | ||
+ | first.write(out); | ||
+ | second.write(out); | ||
+ | } | ||
+ | |||
+ | public int compareTo(Object other) { | ||
+ | PairWritableComparable< | ||
+ | int cmpFirst = first.compareTo(otherPair.getFirst()); | ||
+ | if (cmpFirst < 0) return -1; | ||
+ | if (cmpFirst > 0) return 1; | ||
+ | return second.compareTo(otherPair.getSecond()); | ||
+ | } | ||
+ | |||
+ | public A getFirst() { return first; } | ||
+ | public B getSecond() { return second; } | ||
+ | public void setFirst(A first) { this.first = first; } | ||
+ | public void setSecond(B first) { this.second = second; } | ||
+ | public String toString() { return String.format(" | ||
+ | public PairWritableComparable(A first, B second) { this.first = first; this.second = second; } | ||
+ | } | ||
+ | </ | ||
+ | Remark: If the '' | ||
+ | |||
+ | ===== Exercise 1 ===== | ||
+ | |||
+ | Imagine you want to create an inverted index. In the index, for each word and document containing the word, all positions of the word in the document have to be stored. | ||
+ | |||
+ | Create a type '' | ||
+ | * stores a document of type '' | ||
+ | * stores a list of positions of occurrence. The sequence of length //N// should be stored on disk as number //N// followed by //N// numbers -- positions of occurrence. Type '' | ||
+ | * is comparable, comparing using the '' | ||
+ | * has methods '' | ||
+ | |||
+ | Using this type, create an inverted index -- implement a Hadoop job, that for each word creates a list of '' | ||
+ | |||
+ | ===== Exercise 2 ===== | ||
+ | |||
+ | Optional. Improve the solution to identify the documents by their ids instead of names, i.e., create for each word a sequence of '' | ||
+ | - in the first job, create a list of unique document names. Number the documents using the order in this list. | ||
+ | - in the second job, create for each word a list of '' | ||
+ | |||
+ | ---- | ||
+ | |||
+ | < | ||
+ | <table style=" | ||
+ | <tr> | ||
+ | <td style=" | ||
+ | <td style=" | ||
+ | <td style=" | ||
+ | </tr> | ||
+ | </ | ||
+ | </ | ||