Related Topics
No topics are associated with this blog
package com.georgefisher.tfidf;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
//MapReduce TF-IDF
//
// step 5 find the word in each document with the highest TF-IDF
// ------ =====================================================
//
// map in (word@doc, [tf, idf, tfidf])
// out (doc, word;tfidf)
//
// reduce in (doc, [word;tfidf, word;tfidf, ...])
// out (word@doc, max_tfidf)
/**
 * MapReduce TF-IDF, step 5: for each document, find the word with the
 * highest TF-IDF score.
 *
 * <pre>
 * map    in  (word@doc, text containing "tfidf=tf*idf=&lt;number&gt;")
 *        out (doc, word;tfidf)
 *
 * reduce in  (doc, [word;tfidf, word;tfidf, ...])
 *        out (word@doc, max_tfidf)
 * </pre>
 *
 * Records that cannot be parsed are skipped and tallied under the Hadoop
 * counter {@code TFIDF_step5 / MALFORMED_RECORDS} rather than being passed
 * downstream.
 */
public class TFIDF_step5 {

    /** Counter group used by both the mapper and the reducer. */
    private static final String COUNTER_GROUP = "TFIDF_step5";

    /** Counter incremented once for every record that is skipped as unparsable. */
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    // ============ MAPPER ===============

    /**
     * Re-keys each (word@doc, stats) record by document and keeps only the
     * word and its TF-IDF score.
     */
    public static class Map
            extends Mapper<Text, Text, Text, Text> {

        /**
         * Extracts the number that follows the literal {@code tfidf=tf*idf=}.
         * The optional exponent keeps values such as {@code 1.0E-4} intact;
         * without it only {@code 1.0} would be captured.
         * NOTE(review): whether the upstream step emits exponents is not
         * visible from this file - confirm against its output format.
         */
        private static final Pattern TFIDF_PATTERN = Pattern.compile(
                "tfidf=tf\\*idf=([-]?\\d+\\.\\d+(?:[eE][-+]?\\d+)?)");

        // Output buffers are per-instance so that mapper instances sharing a
        // JVM never write through the same object.
        private final Text doc = new Text();
        private final Text wordTfidf = new Text();

        /**
         * Retained so the public method still exists; the pattern is now a
         * compile-time constant, so there is nothing left to initialise.
         *
         * @param context the task context (unused)
         */
        @Override
        public void setup(Context context) {
            // intentionally empty
        }

        /**
         * Emits (doc, word;tfidf) for one input record.
         *
         * @param key     {@code word@doc}
         * @param value   text containing {@code tfidf=tf*idf=<number>}
         * @param context receives the output pair, or a counter increment
         *                when the record is skipped
         * @throws IOException          if the framework fails to write
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] wordAndDoc = key.toString().split("@");
            Matcher matcher = TFIDF_PATTERN.matcher(value.toString());

            // A key without both parts, or a value without a score, would
            // otherwise reach the reducer as "word;null" (or throw here).
            if (wordAndDoc.length < 2 || !matcher.find()) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            doc.set(wordAndDoc[1]);
            wordTfidf.set(wordAndDoc[0] + ";" + matcher.group(1));
            context.write(doc, wordTfidf);
        }
    }

    // ============ REDUCER ===============

    /**
     * Picks, for each document, the word with the largest TF-IDF score.
     */
    public static class Reduce
            extends Reducer<Text, Text, Text, DoubleWritable> {

        // Per-instance output buffers (see the mapper for the rationale).
        private final Text wordDoc = new Text();
        private final DoubleWritable maxTfidf = new DoubleWritable();

        /**
         * Emits (word@doc, max_tfidf) for one document. When several words
         * share the maximum, the first one encountered is kept. Nothing is
         * written if the document has no parsable value.
         *
         * @param key     the document id
         * @param values  {@code word;tfidf} entries for that document
         * @param context receives the single output pair
         * @throws IOException          if the framework fails to write
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String doc = key.toString();
            String bestWord = null;
            // Double.MIN_VALUE is the smallest POSITIVE double, so it must not
            // be used as the seed: zero or negative scores would never win.
            double best = Double.NEGATIVE_INFINITY;

            for (Text value : values) {
                String entry = value.toString();
                // The score is the last field, so split on the LAST ';' -
                // the word itself may contain one.
                int sep = entry.lastIndexOf(';');
                if (sep < 0) {
                    context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                    continue;
                }

                double tfidf;
                try {
                    tfidf = Double.parseDouble(entry.substring(sep + 1));
                } catch (NumberFormatException ignored) {
                    // Not a number: counted and skipped like any other
                    // malformed record.
                    context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                    continue;
                }
                if (Double.isNaN(tfidf)) {
                    // NaN compares false with everything and would freeze
                    // the running maximum if accepted.
                    context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                    continue;
                }

                if (bestWord == null || tfidf > best) {
                    best = tfidf;
                    bestWord = entry.substring(0, sep);
                }
            }

            // Write only when this document produced a winner; otherwise the
            // buffers would still hold a previous document's result.
            if (bestWord != null) {
                wordDoc.set(bestWord + "@" + doc);
                maxTfidf.set(best);
                context.write(wordDoc, maxTfidf);
            }
        }
    }
}
TF-IDF Result ->
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019