Philadelphia Reflections

The musings of a physician who has served the community for over six decades

Related Topics

George and Computers (2)

TF-IDF Step1

<- TF-IDF Driver
package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
//MapReduce TF-IDF
//step 1
//map       in  (byteCount, line)
//          out (word@doc, 1)
//reduce    in  (word@doc, [count, count, ...])
//          out (word@doc, n)
public class TFIDF_step1 { 
    // ============ MAPPER ===============
    public static class Map
    extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        private static       Text   word_doc = new Text();
        // NOTE: stemming might also be considered
        // NOTE: at scale, memory may be an issue;
        //       possibly forcing the inclusion of certain high-frequency
        //       words that are not technically stop words.
        private static Set<String> stopWords;
        static {
            // we stop ALL words less than 3 in length
            stopWords = new HashSet<String>();
            stopWords.add("about"); stopWords.add("and");
            stopWords.add("are");   stopWords.add("com");
            stopWords.add("for");   stopWords.add("from");
            stopWords.add("that");  stopWords.add("the");
            stopWords.add("this");  stopWords.add("was");
            stopWords.add("what");  stopWords.add("when");
            stopWords.add("where"); stopWords.add("with");
            stopWords.add("who");   stopWords.add("will");
            stopWords.add("the");   stopWords.add("www");
            stopWords.add("verse"); // for shakespeare's sonnets
        //map       in  (byteCount, line)
        //          out (word@doc, 1)
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line    = value.toString().toLowerCase();
            String[] words = line.split("\\W+");
            // skip stop words and junk
            for (int i=0; i < words.length; i++) {
                String word = words[i];
                if (word.length() < 3                   || // note
                    !Character.isLetter(word.charAt(0)) ||
                    Character.isDigit(word.charAt(0))   ||
                    stopWords.contains(word)            ||
                    word.contains("_")) {
                String fileName =
                        ((FileSplit) context.getInputSplit()).getPath().getName();
                context.write(word_doc, ONE);
    // ============ REDUCER ===============
    public static class Reduce
    extends Reducer<Text, IntWritable, Text, IntWritable> {
        private static IntWritable n = new IntWritable();
        //reduce    in  (word@doc, [count, count, ...])
        //          out (word@doc, n)
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            context.write(key, n);
TF-IDF Step2 ->


Please Let Us Know What You Think


(HTML tags provide better formatting)