[ Skip to the content ]

Institute of Formal and Applied Linguistics Wiki


[ Back to the navigation ]

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision Both sides next revision
courses:mapreduce-tutorial:step-29 [2012/01/30 00:40]
straka
courses:mapreduce-tutorial:step-29 [2012/01/30 00:44]
straka
Line 14: Line 14:
 <code java> <code java>
 public class FileAsPathInputFormat extends FileInputFormat<Text, Text> { public class FileAsPathInputFormat extends FileInputFormat<Text, Text> {
 +  // Helper class, which does the actual work -- produces the (path, offset-length) input pair.
   public static class FileAsPathRecordReader extends RecordReader<Text, Text> {   public static class FileAsPathRecordReader extends RecordReader<Text, Text> {
     private Path file;     private Path file;
Line 42: Line 43:
   }   }
              
 +  // Use the helper class as a RecordReader in our file format.
   public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {   public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
     return new FileAsPathRecordReader();     return new FileAsPathRecordReader();
   }      }   
              
 +  // Allow splitting uncompressed files.
   protected boolean isSplittable(JobContext context, Path filename) {   protected boolean isSplittable(JobContext context, Path filename) {
     CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);     CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
Line 55: Line 58:
 ===== WholeFileInputFormat ===== ===== WholeFileInputFormat =====
  
-We start by creating ''WholeFileInputFormat'', which reads any file and returns exactly one input pair (input_path, file_content) with types (''Text'', ''BytesWritable''). The format does not allow file splitting -- each file will be processed by exactly one mapper+Next we create ''WholeFileInputFormat'', which for each file returns exactly one input pair (input_path, file_content) with types (''Text'', ''BytesWritable''). The format does not allow file splitting -- each file will be processed by exactly one mapper.
- +
-The main functionality lies in ''WholeFileRecordReader'', a subclass of [[http://hadoop.apache.org/common/docs/r1.0.0/api/org/apache/hadoop/mapreduce/RecordReader.html|RecordReader<Text, BytesWritable>]].+
  
 <code java> <code java>
Line 65: Line 66:
     private Path file;     private Path file;
     int length;     int length;
-    private boolean value_read; 
     private Text key;     private Text key;
     private BytesWritable value;     private BytesWritable value;
Line 76: Line 76:
       key = null;       key = null;
       value = null;       value = null;
-      value_read = false; 
  
       FileSystem fs = file.getFileSystem(context.getConfiguration());       FileSystem fs = file.getFileSystem(context.getConfiguration());
Line 88: Line 87:
  
     public boolean nextKeyValue() throws IOException {     public boolean nextKeyValue() throws IOException {
-      if (value_read) return false;+      if (key != null) return false;
  
       byte[] data = new byte[length];       byte[] data = new byte[length];
Line 95: Line 94:
       key = new Text(file.toString());       key = new Text(file.toString());
       value = new BytesWritable(data);       value = new BytesWritable(data);
-      value_read = true; 
  
       return true;       return true;
Line 102: Line 100:
     public Text getCurrentKey() { return key; }     public Text getCurrentKey() { return key; }
     public BytesWritable getCurrentValue() { return value; }     public BytesWritable getCurrentValue() { return value; }
-    public float getProgress() { return value_read ? 0 : 1; }+    public float getProgress() { return key == null ? 0 : 1; }
     public synchronized void close() throws IOException { if (in != null) { in.close(); in = null; } }     public synchronized void close() throws IOException { if (in != null) { in.close(); in = null; } }
   }   }
Line 116: Line 114:
   }   }
 } }
- 
 </code> </code>
  
 +===== Exercise: ParagraphTextInputFormat =====

[ Back to the navigation ] [ Back to the content ]