

TF-IDF Step 3

<- TF-IDF Step2
package com.georgefisher.tfidf;
 
import java.io.IOException;
import java.util.HashMap;
 
import org.apache.hadoop.io.Text;
 
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
 
//MapReduce TF-IDF
//
//step 3
//------
//map       in  (word@doc, n;N)
//          out (word, doc;n;N;1)
//
//reduce    in  (word, [doc;n;N;count, doc;n;N;count, ...])
//          out (word@doc, n;N;df)
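//
// To make the record flow concrete, here is a hypothetical example
// (document names and counts invented for illustration only):
//
//   map    in  ("fox@doc1", "3;100")   out ("fox", "doc1;3;100;1")
//          in  ("fox@doc2", "1;80")    out ("fox", "doc2;1;80;1")
//
//   reduce in  ("fox", ["doc1;3;100;1", "doc2;1;80;1"])
//          out ("fox@doc1", "3;100;2")
//          out ("fox@doc2", "1;80;2")
//
// df = 2 because "fox" occurs in two documents.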
 
public class TFIDF_step3 {
    // ============ MAPPER ===============
    public static class Map
    extends Mapper<Text, Text, Text, Text> {
 
        private final Text word      = new Text();
        private final Text doc_n_N_1 = new Text();
 
        //map       in  (word@doc, n;N)
        //          out (word, doc;n;N;1)
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
 
            String[] word_doc = key.toString().split("@");
            word.set(word_doc[0]);
            String doc = word_doc[1];
 
            String[] n_N = value.toString().split(";");
            String n = n_N[0];
            String N = n_N[1];
 
            doc_n_N_1.set(doc +";"+ n +";"+ N +";"+ 1);
 
            context.write(word, doc_n_N_1);
        }
    }
 
    // ============ REDUCER ===============
    public static class Reduce
    extends Reducer<Text, Text, Text, Text> {
 
        private final Text word_doc = new Text();
        private final Text n_N_df   = new Text();
 
        //reduce    in  (word, [doc;n;N;count, doc;n;N;count, ...])
        //          out (word@doc, n;N;df)
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
 
            // NOTE: at scale, memory may be an issue;
            //       possibly forcing the use of intermediate disk storage
            HashMap<String, Integer> word_docList = new HashMap<String, Integer>();
            HashMap<String, Integer> docList      = new HashMap<String, Integer>();
            HashMap<String, Integer> wordList     = new HashMap<String, Integer>();
 
            String word = key.toString();
 
            for (Text value: values) {
                // doc;n;N;count
                String[] doc_n_N_count = value.toString().split(";");
                String  doc    = doc_n_N_count[0];
                Integer n      = Integer.parseInt(doc_n_N_count[1]);
                Integer N      = Integer.parseInt(doc_n_N_count[2]);
                Integer count  = Integer.parseInt(doc_n_N_count[3]);
 
                // save (doc, N), the total number of words in doc
                if (!docList.containsKey(doc)) {
                    docList.put(doc,N);
                } else {
                    // compare with equals(): != on Integer objects compares references, not values
                    if (!N.equals(docList.get(doc))) {
                        System.out.println("N != docList.get(doc)"+
                                ": N="+N+
                                "; doc="+doc+
                                "; docList.get(doc)="+docList.get(doc));
                        System.exit(-1);
                    }
                }
                 
                // save (word@doc, n), the frequency of word in doc
                if (!word_docList.containsKey(word+"@"+doc)) {
                    word_docList.put(word+"@"+doc,n);
                } else {
                    // compare with equals(), as above
                    if (!n.equals(word_docList.get(word+"@"+doc))) {
                        System.out.println("n != word_docList.get(word+\"@\"+doc)"+
                                ": n="+n+
                                "; (word+\"@\"+doc="+word+"@"+doc+
                                "; word_docList="+word_docList.get(word+"@"+doc));
                        System.exit(-1);
                    }
                }
                 
                // accumulate df, the document frequency: the number of documents
                // in the dataset that contain word (each mapper record carries count=1)
                if (!wordList.containsKey(word)) {
                    wordList.put(word, count);
                } else {
                    Integer df = wordList.get(word);
                    df += count;
                    wordList.put(word, df);
                }
            }
 
            // compose the (word@doc, n;N;df) output
            for (String WordDoc: word_docList.keySet()) {
                String[] Word_Doc = WordDoc.split("@");
                String Word       = Word_Doc[0];
                String Doc        = Word_Doc[1];
                Integer little_en = word_docList.get(WordDoc);
                Integer big_en    = docList.get(Doc);
                Integer df        = wordList.get(Word);
 
                word_doc.set(WordDoc);
                n_N_df.set(little_en+";"+big_en+";"+df);
                context.write(word_doc,  n_N_df);
            }
        }
    }
}
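
The listing above contains only the mapper and reducer; a small driver is still needed to wire them into a Hadoop job. The sketch below is a guess at what that driver could look like, assuming step 2 wrote its (word@doc, n;N) records as tab-separated text so that KeyValueTextInputFormat can hand this mapper a Text key and a Text value; the class name TFIDF_step3_Driver and the command-line argument handling are illustrative, not taken from the original series.

package com.georgefisher.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver for step 3 (not part of the original listing)
public class TFIDF_step3_Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "tfidf step3");
        job.setJarByClass(TFIDF_step3.class);

        // step 2 wrote "word@doc<TAB>n;N"; KeyValueTextInputFormat splits on the tab
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        job.setMapperClass(TFIDF_step3.Map.class);
        job.setReducerClass(TFIDF_step3.Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // step 2 output
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // step 3 output
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With n, N, and df all carried along in the step-3 output, the following step presumably combines them into the usual weight, something on the order of tf-idf = (n/N) * log(D/df) with D the total number of documents; see TF-IDF Step4 for the formula actually used.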
TF-IDF Step4 ->

Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019