Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
courses:mapreduce-tutorial:step-28 [2012/01/31 12:40] straka |
courses:mapreduce-tutorial:step-28 [2012/02/05 19:10] (current) straka |
||
---|---|---|---|
Line 1: | Line 1: | ||
- | ====== MapReduce Tutorial : Running multiple Hadoop jobs in one class ====== | + | ====== MapReduce Tutorial : Custom data types ====== |
- | The Java API offers possibility | + | An important feature of the Java API is that custom data and format types can be provided. In this step we implement two custom data types. |
- | | + | |
- | * [[http:// | + | ===== BERIntWritable ===== |
+ | |||
+ | We want to implement BERIntWritable, | ||
+ | |||
+ | The new class must implement the [[http:// | ||
+ | |||
+ | <code java> | ||
+ | public class BERIntWritable implements Writable { | ||
+ | private int value; | ||
+ | |||
+ | public void readFields(DataInput in) throws IOException { | ||
+ | value = 0; | ||
+ | |||
+ | byte next; | ||
+ | while (((next = in.readByte()) & 0x80) != 0) { | ||
+ | value = (value << 7) | (next & 0x7F); | ||
+ | } | ||
+ | value = (value << 7) | next; | ||
+ | } | ||
+ | |||
+ | public void write(DataOutput out) throws IOException { | ||
+ | int mask_shift = 28; | ||
+ | while (mask_shift > 0 && (value & (0x7F << mask_shift)) == 0) mask_shift -= 7; | ||
+ | while (mask_shift > 0) { | ||
+ | out.writeByte(0x80 | ((value >> mask_shift) & 0x7F)); | ||
+ | mask_shift -= 7; | ||
+ | } | ||
+ | out.writeByte(value & 0x7F); | ||
+ | } | ||
+ | </ | ||
+ | Accessor methods '' | ||
+ | <code java> | ||
+ | public int get() { return value; } | ||
+ | public void set(int value) { this.value = value; } | ||
+ | | ||
+ | } | ||
+ | </ | ||
+ | Remark: If the '' | ||
+ | |||
+ | Such an implementation can be used as a type of //values//. If we want to use it as a type of //keys//, we need to implement | ||
+ | <code java> | ||
+ | public class BERIntWritable implements WritableComparable { | ||
+ | ... //Same as before | ||
+ | |||
+ | public int compareTo(Object other) { | ||
+ | int otherValue = ((BERIntWritable)other).get(); | ||
+ | return value < otherValue ? -1 : (value == otherValue ? 0 : 1); | ||
+ | } | ||
+ | } | ||
+ | </ | ||
+ | |||
+ | ===== PairWritable< | ||
+ | |||
+ | As another example, we implement a type consisting of two user-defined '' | ||
+ | <code java> | ||
+ | public static class PairWritable< | ||
+ | private A first; | ||
+ | private B second; | ||
+ | |||
+ | public void readFields(DataInput in) throws IOException { | ||
+ | first.readFields(in); | ||
+ | second.readFields(in); | ||
+ | } | ||
+ | |||
+ | public void write(DataOutput out) throws IOException { | ||
+ | first.write(out); | ||
+ | second.write(out); | ||
+ | } | ||
+ | |||
+ | public A getFirst() { return first; } | ||
+ | public B getSecond() { return second; } | ||
+ | public void setFirst(A first) { this.first = first; } | ||
+ | public void setSecond(B second) { this.second = second; } | ||
+ | public String toString() { return String.format(" | ||
+ | public PairWritable(A first, B second) { this.first = first; this.second = second; } | ||
+ | } | ||
+ | </ | ||
+ | Remark: If the '' | ||
+ | |||
+ | We did not define '' | ||
+ | <code java> | ||
+ | public static class PairWritableComparable< | ||
+ | private A first; | ||
+ | private B second; | ||
+ | |||
+ | public void readFields(DataInput in) throws IOException { | ||
+ | first.readFields(in); | ||
+ | second.readFields(in); | ||
+ | } | ||
+ | |||
+ | public void write(DataOutput out) throws IOException { | ||
+ | first.write(out); | ||
+ | second.write(out); | ||
+ | } | ||
+ | |||
+ | public int compareTo(Object other) { | ||
+ | PairWritableComparable< | ||
+ | int cmpFirst = first.compareTo(otherPair.getFirst()); | ||
+ | if (cmpFirst < 0) return -1; | ||
+ | if (cmpFirst > 0) return 1; | ||
+ | return second.compareTo(otherPair.getSecond()); | ||
+ | } | ||
+ | |||
+ | public A getFirst() { return first; } | ||
+ | public B getSecond() { return second; } | ||
+ | public void setFirst(A first) { this.first = first; } | ||
+ | public void setSecond(B second) { this.second = second; } | ||
+ | public String toString() { return String.format(" | ||
+ | public PairWritableComparable(A first, B second) { this.first = first; this.second = second; } | ||
+ | } | ||
+ | </ | ||
+ | Remark: If the '' | ||
===== Exercise 1 ===== | ===== Exercise 1 ===== | ||
- | Improve the last [[.: | + | Imagine you want to create an inverted index. |
- | - in the first job, create a list of unique document names. Number | + | |
- | - in the second job, create | + | |
- | ===== Exercise 2 ===== | + | Create a type '' |
+ | * stores a document of type '' | ||
+ | * stores a list of positions of occurrence. A sequence of length //N// should be stored on disk as the number //N// followed by //N// numbers -- the positions of occurrence. Type '' | ||
+ | * is comparable, comparing using the '' | ||
+ | * has methods '' | ||
- | Implement the [[.:step-15|K-means clustering exercise]] in Java. Instead | + | Using this type, create an inverted index -- implement a Hadoop job that, for each word, creates a list of '' |
+ | ===== Exercise 2 ===== | ||
+ | |||
+ | Optional. Improve the solution to identify the documents by their ids instead of names, i.e., create for each word a sequence of '' | ||
+ | - in the first job, create a list of unique document names. Number the documents using the order in this list. | ||
+ | - in the second job, create for each word a list of '' | ||
---- | ---- | ||
Line 21: | Line 139: | ||
<table style=" | <table style=" | ||
<tr> | <tr> | ||
- | <td style=" | + | <td style=" |
<td style=" | <td style=" | ||
<td style=" | <td style=" |