Philadelphia Reflections: TF-IDF Step 4

TF-IDF Step 4

package com.georgefisher.tfidf;
 
import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
 
import org.apache.hadoop.mapreduce.Mapper;
 
//MapReduce TF-IDF
//
//step 4
//------
//map       in  (word@doc, n;N;df)
//          out (word@doc, [tf, idf, tfidf])
//reduce    none, map-only
 
public class TFIDF_step4 {
    // ============ MAPPER ===============
    /**
     * Map-only final step of the TF-IDF pipeline.
     *
     * <p>Input records are {@code (word@doc, "n;N;df")}; for each one the
     * mapper emits {@code (word@doc, "[tf=... idf=... tfidf=...]")}, where
     * {@code tf} is the raw count {@code n} and
     * {@code idf = ln(fileCount / df)}. {@code fileCount} is the number of
     * files found under the folder named by the {@code originalInputDir}
     * job property. The {@code N} field is parsed past but not used.
     */
    public static class Map
    extends Mapper<Text, Text, Text, Text> {

        /** Job-configuration property naming the original input folder. */
        private static final String ORIGINAL_INPUT_DIR_KEY = "originalInputDir";

        /** Separator between the n, N and df fields of the input value. */
        private static final String FIELD_SEPARATOR = ";";

        // All mutable state is per-instance rather than static: static
        // fields would be shared by every Map instance living in the same
        // JVM, so one task could overwrite another's file count or output
        // buffers.

        /** Number of files under the original input folder; set in setup(). */
        private long fileCount;

        /** Reusable output key buffer. */
        private final Text word_doc = new Text();

        /** Reusable output value buffer. */
        private final Text result   = new Text();

        /**
         * Counts the files under the original input folder.
         *
         * @param context task context supplying the job configuration
         * @throws IOException if {@code originalInputDir} is not configured
         *                     or the folder cannot be summarized
         */
        @Override
        public void setup(Context context) throws IOException {
            Configuration conf     = context.getConfiguration();
            String        inputDir = conf.get(ORIGINAL_INPUT_DIR_KEY);
            if (inputDir == null || inputDir.isEmpty()) {
                throw new IOException("Job configuration property '"
                        + ORIGINAL_INPUT_DIR_KEY + "' is not set");
            }

            // Resolve the filesystem from the path itself so that a
            // fully-qualified URI on a non-default filesystem also works.
            Path           pt = new Path(inputDir);
            FileSystem     fs = pt.getFileSystem(conf);
            ContentSummary cs = fs.getContentSummary(pt);

            // kept as long: getFileCount() returns long, no truncating cast
            fileCount = cs.getFileCount();
        }

        /**
         * Converts one {@code (word@doc, "n;N;df")} record into
         * {@code (word@doc, "[tf, idf, tfidf]")}.
         *
         * @param key     the {@code word@doc} identifier, passed through
         * @param value   {@code "n;N;df"}; only n and df are used
         * @param context sink for the output record
         * @throws IOException if the value is malformed, or if df or the
         *                     file count is not positive (the logarithm
         *                     would be infinite or undefined)
         * @throws InterruptedException propagated from {@code context.write}
         */
        //map       in  (word@doc, n;N;df)
        //          out (word@doc, [tf, idf, tfidf])
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            // NOTE: since this step is map-only, its logic could easily be
            //       incorporated into a loop at the end of the previous
            //       step if the cost of job setup and teardown exceeds
            //       the desire for clear code
            String[] fields = value.toString().split(FIELD_SEPARATOR);
            if (fields.length < 3) {
                throw new IOException("Expected value 'n;N;df' for key '"
                        + key + "' but got '" + value + "'");
            }

            final int n;
            final int df;
            try {
                n  = Integer.parseInt(fields[0]);
                // fields[1] is N; it is not needed for this calculation
                df = Integer.parseInt(fields[2]);
            } catch (NumberFormatException e) {
                throw new IOException("Non-numeric field in value '" + value
                        + "' for key '" + key + "'", e);
            }

            // Fail loudly instead of silently emitting Infinity/NaN scores.
            if (df <= 0) {
                throw new IOException("Document frequency must be positive"
                        + " but was " + df + " for key '" + key + "'");
            }
            if (fileCount <= 0) {
                throw new IOException("No files were counted under '"
                        + ORIGINAL_INPUT_DIR_KEY + "'; cannot compute idf"
                        + " for key '" + key + "'");
            }

            double tf    = (double)n;
            double idf   = Math.log((double)fileCount / (double)df);
            double tfidf = tf * idf;

            word_doc.set(key);
            result.set("[tf="+n +
                        " idf=log(fileCount/df)=log("+fileCount+"/"+df+")="+idf+
                        " tfidf=tf*idf="+tfidf+"]");

            context.write(word_doc, result);
        }
    }
}
TF-IDF Step 5 ->
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019