package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// MapReduce TF-IDF
//
// step 2
// ------
// map    in  (word@doc, n)
//        out (doc, word;n)
//
// reduce in  (doc, [word;n, word;n, ...])
//        out (word@doc, n;N)
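//
// Illustrative trace (values invented for this example): if doc1
// contains 5 words in total, 2 of which are "fox", then
//   map    in  (fox@doc1, 2)          out (doc1, fox;2)
//   reduce in  (doc1, [fox;2, ...])   out (fox@doc1, 2;5)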
public class TFIDF_step2 {

    // ============ MAPPER ===============
    public static class Map
        extends Mapper<Text, Text, Text, Text> {

        private final Text doc    = new Text();
        private final Text word_n = new Text();

        // map in  (word@doc, n)
        //     out (doc, word;n)
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            String[] word_doc = key.toString().split("@");
            String word = word_doc[0];
            doc.set(word_doc[1]);

            String n = value.toString();
            word_n.set(word + ";" + n);

            context.write(doc, word_n);
        }
    }
    // ============ REDUCER ===============
    public static class Reduce
        extends Reducer<Text, Text, Text, Text> {

        private final Text word_doc = new Text();
        private final Text n_N      = new Text();

        // reduce in  (doc, [word;n, word;n, ...])
        //        out (word@doc, n;N)
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {

            String doc = key.toString();

            // NOTE: at scale, holding every (word, n) pair for a document
            // in memory may be an issue, possibly forcing the use of
            // intermediate disk storage
            int N = 0;
            HashMap<String, Integer> wordList = new HashMap<String, Integer>();
            for (Text word_n : values) {
                String[] bits = word_n.toString().split(";");
                String word = bits[0];
                int n = Integer.parseInt(bits[1]);
                wordList.put(word, n);
                N += n; // N accumulates the total word count of this document
            }

            // emit (word@doc, n;N) for every word seen in the document
            for (String word : wordList.keySet()) {
                word_doc.set(word + "@" + doc);
                n_N.set(wordList.get(word) + ";" + N);
                context.write(word_doc, n_N);
            }
        }
    }
}
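The mapper is declared as Mapper&lt;Text, Text, Text, Text&gt;, which implies the job reads step 1's output with KeyValueTextInputFormat (key and value split on the tab character that TextOutputFormat writes by default). The listing above doesn't include a driver; a minimal sketch of one, assuming that input format and a hypothetical TFIDF_step2_Driver class name, might look like this:

package com.georgefisher.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver for step 2; not part of the original listing
public class TFIDF_step2_Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "TF-IDF step 2");
        job.setJarByClass(TFIDF_step2.class);

        // read step 1's (word@doc TAB n) lines back as (Text, Text) pairs
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        job.setMapperClass(TFIDF_step2.Map.class);
        job.setReducerClass(TFIDF_step2.Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // step 1 output dir
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // step 2 output dir

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With a driver along these lines, step 2 chains directly onto step 1's output directory, and its own output feeds step 3 in the same key TAB value format.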
Next: TF-IDF Step 3
Originally published: Wednesday, February 04, 2015; most recently modified: Thursday, June 06, 2019