package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// MapReduce TF-IDF
//
// step 2
// ------
// map    in  (word@doc, n)
//        out (doc, word;n)
//
// reduce in  (doc, [word;n, word;n, ...])
//        out (word@doc, n;N)
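//
// Illustrative trace (values invented for this example): if doc1
// contains 5 words in total, 2 of which are "fox", then
//   map    in  (fox@doc1, 2)          out (doc1, fox;2)
//   reduce in  (doc1, [fox;2, ...])   out (fox@doc1, 2;5)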
public class TFIDF_step2 {

    // ============ MAPPER ===============
    public static class Map
        extends Mapper<Text, Text, Text, Text> {

        private final Text doc    = new Text();
        private final Text word_n = new Text();

        // map in  (word@doc, n)
        //     out (doc, word;n)
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            String[] word_doc = key.toString().split("@");
            String word = word_doc[0];
            doc.set(word_doc[1]);

            String n = value.toString();
            word_n.set(word + ";" + n);

            context.write(doc, word_n);
        }
    }
    // ============ REDUCER ===============
    public static class Reduce
        extends Reducer<Text, Text, Text, Text> {

        private final Text word_doc = new Text();
        private final Text n_N      = new Text();

        // reduce in  (doc, [word;n, word;n, ...])
        //        out (word@doc, n;N)
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {

            String doc = key.toString();

            // NOTE: at scale, holding every (word, n) pair for a document
            // in memory may be an issue, possibly forcing the use of
            // intermediate disk storage
            int N = 0;
            HashMap<String, Integer> wordList = new HashMap<String, Integer>();
            for (Text word_n : values) {
                String[] bits = word_n.toString().split(";");
                String word = bits[0];
                int n = Integer.parseInt(bits[1]);
                wordList.put(word, n);
                N += n; // N accumulates the total word count of this document
            }

            // emit (word@doc, n;N) for every word seen in the document
            for (String word : wordList.keySet()) {
                word_doc.set(word + "@" + doc);
                n_N.set(wordList.get(word) + ";" + N);
                context.write(word_doc, n_N);
            }
        }
    }
}
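The mapper is declared as Mapper&lt;Text, Text, Text, Text&gt;, which implies the job reads step 1's output with KeyValueTextInputFormat (key and value split on the tab character that TextOutputFormat writes by default). The listing above doesn't include a driver; a minimal sketch of one, assuming that input format and a hypothetical TFIDF_step2_Driver class name, might look like this:

package com.georgefisher.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver for step 2; not part of the original listing
public class TFIDF_step2_Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "TF-IDF step 2");
        job.setJarByClass(TFIDF_step2.class);

        // read step 1's (word@doc TAB n) lines back as (Text, Text) pairs
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        job.setMapperClass(TFIDF_step2.Map.class);
        job.setReducerClass(TFIDF_step2.Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // step 1 output dir
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // step 2 output dir

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With a driver along these lines, step 2 chains directly onto step 1's output directory, and its own output feeds step 3 in the same key TAB value format.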
Next: TF-IDF Step 3
Originally published: Wednesday, February 04, 2015; most recently modified: Thursday, June 06, 2019