Differences
This shows you the differences between two versions of the page.
| Both sides previous revision Previous revision Next revision | Previous revision | ||
|
courses:mapreduce-tutorial:step-28 [2012/01/31 12:40] straka |
courses:mapreduce-tutorial:step-28 [2012/02/05 19:10] (current) straka |
||
|---|---|---|---|
| Line 1: | Line 1: | ||
| - | ====== MapReduce Tutorial : Running multiple Hadoop jobs in one class ====== | + | ====== MapReduce Tutorial : Custom data types ====== |
| - | The Java API offers possibility | + | An important feature of the Java API is that custom data and format types can be provided. In this step we implement two custom data types. |
| - | | + | |
| - | * [[http:// | + | ===== BERIntWritable ===== |
| + | |||
| + | We want to implement BERIntWritable, | ||
| + | |||
| + | The new class must implement the [[http:// | ||
| + | |||
| + | <code java> | ||
| + | public class BERIntWritable implements Writable { | ||
| + | private int value; | ||
| + | |||
| + | public void readFields(DataInput in) throws IOException { | ||
| + | value = 0; | ||
| + | |||
| + | byte next; | ||
| + | while (((next = in.readByte()) & 0x80) != 0) { | ||
| + | value = (value << 7) | (next & 0x7F); | ||
| + | } | ||
| + | value = (value << 7) | next; | ||
| + | } | ||
| + | |||
| + | /** Writes value as a variable-length sequence of 7-bit groups, most significant group first; every byte except the last has its high bit set. */ | ||
| + | public void write(DataOutput out) throws IOException { | ||
| + | int mask_shift = 28; | ||
| + | // Skip leading 7-bit groups that are all zero. | ||
| + | while (mask_shift > 0 && (value & (0x7F << mask_shift)) == 0) mask_shift -= 7; | ||
| + | while (mask_shift > 0) { | ||
| + | out.writeByte(0x80 | ((value >> mask_shift) & 0x7F)); | ||
| + | mask_shift -= 7; | ||
| + | } | ||
| + | // The final group is written without the continuation bit. | ||
| + | out.writeByte(value & 0x7F); | ||
| + | } | ||
| + | </ | ||
| + | Accessor methods '' | ||
| + | <code java> | ||
| + | public int get() { return value; } | ||
| + | public void set(int value) { this.value = value; } | ||
| + | | ||
| + | } | ||
| + | </ | ||
| + | Remark: If the '' | ||
| + | |||
| + | Such implementation can be used as a type of //values//. If we wanted to use it as a type of //keys//, we need to implement | ||
| + | <code java> | ||
| + | public class BERIntWritable implements WritableComparable { | ||
| + | ... //Same as before | ||
| + | |||
| + | public int compareTo(Object other) { | ||
| + | int otherValue = ((BERIntWritable)other).get(); | ||
| + | return value < otherValue ? -1 : (value == otherValue ? 0 : 1); | ||
| + | } | ||
| + | } | ||
| + | </ | ||
| + | |||
| + | ===== PairWritable< | ||
| + | |||
| + | As another example, we implement a type consisting of two user-defined '' | ||
| + | <code java> | ||
| + | public static class PairWritable< | ||
| + | private A first; | ||
| + | private B second; | ||
| + | |||
| + | public void readFields(DataInput in) throws IOException { | ||
| + | first.readFields(in); | ||
| + | second.readFields(in); | ||
| + | } | ||
| + | |||
| + | public void write(DataOutput out) throws IOException { | ||
| + | first.write(out); | ||
| + | second.write(out); | ||
| + | } | ||
| + | |||
| + | public A getFirst() { return first; } | ||
| + | public B getSecond() { return second; } | ||
| + | public void setFirst(A first) { this.first = first; } | ||
| + | /** Replaces the second component of the pair. */ | ||
| + | public void setSecond(B second) { this.second = second; } | ||
| + | public String toString() { return String.format(" | ||
| + | public PairWritable(A first, B second) { this.first = first; this.second = second; } | ||
| + | } | ||
| + | </ | ||
| + | Remark: If the '' | ||
| + | |||
| + | We did not define '' | ||
| + | <code java> | ||
| + | public static class PairWritableComparable< | ||
| + | private A first; | ||
| + | private B second; | ||
| + | |||
| + | public void readFields(DataInput in) throws IOException { | ||
| + | first.readFields(in); | ||
| + | second.readFields(in); | ||
| + | } | ||
| + | |||
| + | public void write(DataOutput out) throws IOException { | ||
| + | first.write(out); | ||
| + | second.write(out); | ||
| + | } | ||
| + | |||
| + | public int compareTo(Object other) { | ||
| + | PairWritableComparable< | ||
| + | int cmpFirst = first.compareTo(otherPair.getFirst()); | ||
| + | if (cmpFirst < 0) return -1; | ||
| + | if (cmpFirst > 0) return 1; | ||
| + | return second.compareTo(otherPair.getSecond()); | ||
| + | } | ||
| + | |||
| + | public A getFirst() { return first; } | ||
| + | public B getSecond() { return second; } | ||
| + | public void setFirst(A first) { this.first = first; } | ||
| + | /** Replaces the second component of the pair. */ | ||
| + | public void setSecond(B second) { this.second = second; } | ||
| + | public String toString() { return String.format(" | ||
| + | public PairWritableComparable(A first, B second) { this.first = first; this.second = second; } | ||
| + | } | ||
| + | </ | ||
| + | Remark: If the '' | ||
| ===== Exercise 1 ===== | ===== Exercise 1 ===== | ||
| - | Improve the last [[.: | + | Imagine you want to create an inverted index. |
| - | - in the first job, create a list of unique document names. Number | + | |
| - | - in the second job, create | + | |
| - | ===== Exercise 2 ===== | + | Create a type '' |
| + | * stores a document of type '' | ||
| + | * stores a list of positions of occurrence. The sequence of length //N// should be stored on disk as number //N// followed by //N// numbers -- positions of occurrence. Type '' | ||
| + | * is comparable, comparing using the '' | ||
| + | * has methods '' | ||
| - | Implement the [[.:step-15|K-means clustering exercise]] in Java. Instead | + | Using this type, create an inverted index -- implement a Hadoop job, that for each word creates a list of '' |
| + | ===== Exercise 2 ===== | ||
| + | |||
| + | Optional. Improve the solution to identify the documents by their ids instead of names, i.e., create for each word a sequence of '' | ||
| + | - in the first job, create a list of unique document names. Number the documents using the order in this list. | ||
| + | - in the second job, create for each word a list of '' | ||
| ---- | ---- | ||
| Line 21: | Line 139: | ||
| <table style=" | <table style=" | ||
| <tr> | <tr> | ||
| - | <td style=" | + | <td style=" |
| <td style=" | <td style=" | ||
| <td style=" | <td style=" | ||
