Related Topics
No topics are associated with this blog
package com.georgefisher.tfidf;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
//MapReduce TF-IDF
//
//step 1
//------
//map in (byteCount, line)
// out (word@doc, 1)
//
//reduce in (word@doc, [count, count, ...])
// out (word@doc, n)
/**
 * Step 1 of the MapReduce TF-IDF pipeline: counts how many times each word
 * occurs in each input document.
 *
 * <pre>
 * map    in  (byteCount, line)
 *        out (word@doc, 1)
 *
 * reduce in  (word@doc, [count, count, ...])
 *        out (word@doc, n)
 * </pre>
 */
public class TFIDF_step1 {

    // ============ MAPPER ===============
    /**
     * Tokenizes each input line and emits {@code (word@fileName, 1)} for every
     * token that survives the stop-word/junk filter.
     */
    public static class Map
            extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count emitted for every accepted token; never mutated. */
        private static final IntWritable ONE = new IntWritable(1);

        /**
         * Token separator: one or more non-word characters. Compiled once here
         * because {@code String.split(regex)} compiles the pattern on every call.
         */
        private static final Pattern NON_WORD = Pattern.compile("\\W+");

        /** we stop ALL words less than 3 in length */
        private static final int MIN_WORD_LENGTH = 3;

        // NOTE: stemming might also be considered
        // NOTE: at scale, memory may be an issue;
        //       possibly forcing the inclusion of certain high-frequency
        //       words that are not technically stop words.
        private static final Set<String> STOP_WORDS = new HashSet<String>();
        static {
            STOP_WORDS.add("about"); STOP_WORDS.add("and");
            STOP_WORDS.add("are");   STOP_WORDS.add("com");
            STOP_WORDS.add("for");   STOP_WORDS.add("from");
            STOP_WORDS.add("how");
            STOP_WORDS.add("that");  STOP_WORDS.add("the");
            STOP_WORDS.add("this");  STOP_WORDS.add("was");
            STOP_WORDS.add("what");  STOP_WORDS.add("when");
            STOP_WORDS.add("where"); STOP_WORDS.add("with");
            STOP_WORDS.add("who");   STOP_WORDS.add("will");
            STOP_WORDS.add("www");
            STOP_WORDS.add("verse"); // for shakespeare's sonnets
        }

        /**
         * Reusable output key. Held per mapper instance (not static) so that
         * several mapper instances living in one JVM, e.g. when run under
         * MultithreadedMapper, cannot overwrite each other's key.
         */
        private final Text wordDoc = new Text();

        /**
         * Emits {@code (word@fileName, 1)} for each accepted token of the line.
         *
         * @param key     byte offset of the line within its input file (unused)
         * @param value   one line of input text
         * @param context job context; supplies the input split and receives output
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         * @throws ClassCastException   if a token is accepted and the input split
         *                              is not a {@link FileSplit}
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Locale.ROOT keeps lower-casing independent of the JVM default
            // locale; under a Turkish locale "I" would become dotless "ı",
            // which \W+ treats as a separator and the stop-word list misses.
            String line = value.toString().toLowerCase(Locale.ROOT);

            // The file name is the same for every token of the line, so it is
            // resolved at most once, and only if some token is actually emitted.
            String fileName = null;

            for (String word : NON_WORD.split(line)) {
                if (!isIndexable(word)) {
                    continue;
                }
                if (fileName == null) {
                    fileName =
                        ((FileSplit) context.getInputSplit()).getPath().getName();
                }
                wordDoc.set(word + "@" + fileName);
                context.write(wordDoc, ONE);
            }
        }

        /**
         * Decides whether a token should be counted: it must be at least
         * {@link #MIN_WORD_LENGTH} characters long, start with a letter,
         * contain no underscore (underscore is a regex word character, so
         * {@code \W+} does not split on it), and not be a stop word.
         */
        private static boolean isIndexable(String word) {
            return word.length() >= MIN_WORD_LENGTH
                    && Character.isLetter(word.charAt(0))
                    && !word.contains("_")
                    && !STOP_WORDS.contains(word);
        }
    }

    // ============ REDUCER ===============
    /**
     * Sums the per-occurrence counts of each {@code word@doc} key into a
     * single total.
     */
    public static class Reduce
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reusable output value; per instance so no state is shared statically. */
        private final IntWritable total = new IntWritable();

        /**
         * Emits {@code (word@doc, n)} where n is the sum of all incoming counts.
         *
         * @param key     the {@code word@doc} key
         * @param values  the counts emitted for this key
         * @param context receives the summed output
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }
}
TF-IDF Step2 ->
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019