Philadelphia Reflections

The musings of a physician who has served the community for over six decades

Related Topics

No topics are associated with this blog

TF-IDF Step1

<- TF-IDF Driver
package com.georgefisher.tfidf;
 
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
 
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
 
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 
//MapReduce TF-IDF
//
//step 1
//------
//map       in  (byteCount, line)
//          out (word@doc, 1)
//
//reduce    in  (word@doc, [count, count, ...])
//          out (word@doc, n)
 
/**
 * Step 1 of the MapReduce TF-IDF computation: counts how many times each
 * accepted word occurs in each input file.
 *
 * <pre>
 * map       in  (key, line)
 *           out (word@doc, 1)
 *
 * reduce    in  (word@doc, [count, count, ...])
 *           out (word@doc, n)
 * </pre>
 */
public class TFIDF_step1 {

    // ============ MAPPER ===============
    /**
     * Splits each input line into lower-cased words, drops stop words and
     * junk tokens, and emits {@code (word@fileName, 1)} for every word kept.
     */
    public static class Map
    extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** The constant count emitted for every accepted word occurrence. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Token delimiter: one or more non-word characters; compiled once. */
        private static final Pattern NON_WORD = Pattern.compile("\\W+");

        /** Words shorter than this are always skipped. */
        private static final int MIN_WORD_LENGTH = 3;

        // NOTE: stemming might also be considered

        // NOTE: at scale, memory may be an issue;
        //       possibly forcing the inclusion of certain high-frequency
        //       words that are not technically stop words.
        /**
         * Words that are never emitted. Every entry is at least
         * {@link #MIN_WORD_LENGTH} characters long because shorter words are
         * already rejected by the length check.
         */
        private static final Set<String> STOP_WORDS =
                Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
                        "about", "and",
                        "are",   "com",
                        "for",   "from",
                        "how",
                        "that",  "the",
                        "this",  "was",
                        "what",  "when",
                        "where", "with",
                        "who",   "will",
                        "www",

                        "verse"))); // for shakespeare's sonnets

        // Reused output key. Kept per instance (not static) so that separate
        // mapper instances living in the same JVM never share a buffer.
        private final Text wordDoc = new Text();

        /**
         * Emits {@code (word@doc, 1)} for every accepted word on the line.
         *
         * @param key     input key supplied by the input format (not used)
         * @param value   one line of input text
         * @param context task context; must be reading from a file-based
         *                split, because the split is cast to {@link FileSplit}
         *                to obtain the document name
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // The file name does not change while a line is being processed,
            // so look it up once instead of once per emitted word.
            String fileName =
                    ((FileSplit) context.getInputSplit()).getPath().getName();

            // Locale.ROOT keeps the case mapping independent of the JVM's
            // default locale (e.g. Turkish maps 'I' to a dotless 'i', which
            // the \W+ delimiter would then treat as a word break).
            String line = value.toString().toLowerCase(Locale.ROOT);

            for (String word : NON_WORD.split(line)) {
                if (!isIndexable(word)) {
                    continue; // skip stop words and junk
                }
                wordDoc.set(word + "@" + fileName);
                context.write(wordDoc, ONE);
            }
        }

        /**
         * Decides whether a token should be counted.
         *
         * A token is kept when it is at least {@link #MIN_WORD_LENGTH}
         * characters long, starts with a letter, contains no underscore and
         * is not a stop word. The length test runs first, so the empty
         * strings that splitting can produce never reach {@code charAt(0)}.
         */
        private static boolean isIndexable(String word) {
            return word.length() >= MIN_WORD_LENGTH
                    && Character.isLetter(word.charAt(0))
                    && !word.contains("_")
                    && !STOP_WORDS.contains(word);
        }
    }

    // ============ REDUCER ===============
    /**
     * Sums the partial counts received for each {@code word@doc} key and
     * emits {@code (word@doc, n)}.
     */
    public static class Reduce
    extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value. Kept per instance (not static) so that separate
        // reducer instances living in the same JVM never share a buffer.
        private final IntWritable n = new IntWritable();

        /**
         * Adds up all counts for one key and writes the total.
         *
         * @param key     the {@code word@doc} key
         * @param values  the counts emitted for that key
         * @param context task context the total is written to
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {

            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            n.set(sum);
            context.write(key, n);
        }
    }
}
TF-IDF Step2 ->

Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019