Philadelphia Reflections

The musings of a physician who has served the community for over six decades

Related Topics

No topics are associated with this blog

TF-IDF Step 4

<- TF-IDF Step3
package com.georgefisher.tfidf;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//MapReduce TF-IDF
//step 4
//map       in  (word@doc, n;N;df)
//          out (word@doc, [tf, idf, tfidf])
//reduce    none, map-only
public class TFIDF_step4 {
    // ============ MAPPER ===============
    public static class Map
    extends Mapper<Text, Text, Text, Text> {
        private static int fileCount;
        public void setup(Context context) throws IOException {
            // get the number of documents in the original input folder
            Configuration  conf = context.getConfiguration();
            FileSystem     fs   = FileSystem.get(conf);
            Path           pt   = new Path(conf.get("originalInputDir"));
            ContentSummary cs   = fs.getContentSummary(pt);
            fileCount           = (int)cs.getFileCount();
        private final static Text word_doc = new Text();
        private static       Text result   = new Text();
        //map       in  (word@doc, n;N;df)
        //          out (word@doc, [tf, idf, tfidf])
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // NOTE: since this step is map-only, its logic could easily be
            //       incorporated into a loop at the end of the previous
            //       step if the cost of job setup and teardown exceeds
            //       the desire for clear code
            String[] n_N_m = value.toString().split(";");
            Integer n = Integer.parseInt(n_N_m[0]);
          //Integer N = Integer.parseInt(n_N_m[1]);
            Integer df = Integer.parseInt(n_N_m[2]);
            double tf    = (double)n;
            double idf   = Math.log((double)fileCount / (double)df);
            double tfidf = tf * idf;
            result.set("[tf="+n +
                        " idf=log(fileCount/df)=log("+fileCount+"/"+df+")="+idf+
                        " tfidf=tf*idf="+tfidf+"]");
            context.write(word_doc, result);
TF-IDF Step5 ->

Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019