package com.georgefisher.tfidf;
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
//MapReduce TF-IDF
//
//step 3
//------
//map in (word@doc, n;N)
// out (word, doc;n;N;1)
//
//reduce in (word, [doc;n;N;count, doc;n;N;count, ...])
// out (word@doc, n;N;df)
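//where, for each word and each document doc:
//  n  = number of times word appears in doc
//  N  = total number of words in doc
//  df = number of documents in the corpus that contain word
//
//(step 4 presumably combines these as tf-idf = (n/N) * log(D/df),
// with D the total number of documents)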
public class TFIDF_step3 {

    // ============ MAPPER ===============
    public static class Map
            extends Mapper<Text, Text, Text, Text> {

        private final static Text word = new Text();
        private static Text doc_n_N_1 = new Text();

        //map    in  (word@doc, n;N)
        //       out (word, doc;n;N;1)
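        //       e.g. ("fish@doc1", "3;100") -> ("fish", "doc1;3;100;1")   [hypothetical record]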
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            String[] word_doc = key.toString().split("@");
            word.set(word_doc[0]);
            String doc = word_doc[1];

            String[] n_N = value.toString().split(";");
            String n = n_N[0];
            String N = n_N[1];

            doc_n_N_1.set(doc + ";" + n + ";" + N + ";" + 1);
            context.write(word, doc_n_N_1);
        }
    }
    // ============ REDUCER ===============
    public static class Reduce
            extends Reducer<Text, Text, Text, Text> {

        private static Text word_doc = new Text();
        private static Text n_N_df = new Text();

        //reduce in  (word, [doc;n;N;count, doc;n;N;count, ...])
        //       out (word@doc, n;N;df)
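        //       e.g. ("fish", ["doc1;3;100;1", "doc2;5;80;1"])
        //            -> ("fish@doc1", "3;100;2"), ("fish@doc2", "5;80;2")   [hypothetical records]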
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {

            // NOTE: at scale, memory may be an issue;
            //       possibly forcing the use of intermediate disk storage
            HashMap<String, Integer> word_docList = new HashMap<String, Integer>();
            HashMap<String, Integer> docList = new HashMap<String, Integer>();
            HashMap<String, Integer> wordList = new HashMap<String, Integer>();

            String word = key.toString();

            for (Text value : values) {
                // doc;n;N;count
                String[] doc_n_N_count = value.toString().split(";");
                String doc = doc_n_N_count[0];
                Integer n = Integer.parseInt(doc_n_N_count[1]);
                Integer N = Integer.parseInt(doc_n_N_count[2]);
                Integer count = Integer.parseInt(doc_n_N_count[3]);

                // save (doc, N), the total number of words in doc;
                // N must agree across all records for the same doc
                // (compare Integer objects with equals(), not !=)
                if (!docList.containsKey(doc)) {
                    docList.put(doc, N);
                } else if (!N.equals(docList.get(doc))) {
                    throw new IOException("inconsistent N" +
                            ": N=" + N +
                            "; doc=" + doc +
                            "; docList.get(doc)=" + docList.get(doc));
                }

                // save (word@doc, n), the frequency of word in doc;
                // n must agree across all records for the same word@doc
                if (!word_docList.containsKey(word + "@" + doc)) {
                    word_docList.put(word + "@" + doc, n);
                } else if (!n.equals(word_docList.get(word + "@" + doc))) {
                    throw new IOException("inconsistent n" +
                            ": n=" + n +
                            "; word@doc=" + word + "@" + doc +
                            "; word_docList=" + word_docList.get(word + "@" + doc));
                }

                // accumulate df, the number of documents that contain word
                if (!wordList.containsKey(word)) {
                    wordList.put(word, count);
                } else {
                    wordList.put(word, wordList.get(word) + count);
                }
            }

            // compose the (word@doc, n;N;df) output
            for (String WordDoc : word_docList.keySet()) {
                String[] Word_Doc = WordDoc.split("@");
                String Word = Word_Doc[0];
                String Doc = Word_Doc[1];

                Integer little_en = word_docList.get(WordDoc); // n
                Integer big_en = docList.get(Doc);             // N
                Integer df = wordList.get(Word);               // df

                word_doc.set(WordDoc);
                n_N_df.set(little_en + ";" + big_en + ";" + df);
                context.write(word_doc, n_N_df);
            }
        }
    }
}
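The class above defines only the mapper and reducer, so it needs a driver to run. Below is a minimal driver sketch, not part of the original post: it assumes the step-2 output is tab-separated text read with KeyValueTextInputFormat, and the class name TFIDF_step3_Driver and the command-line input/output paths are hypothetical.

package com.georgefisher.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// hypothetical driver for step 3 (a sketch, not the original author's code)
public class TFIDF_step3_Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "tfidf step3");
        job.setJarByClass(TFIDF_step3.class);

        // assume step-2 output is tab-separated (word@doc \t n;N) text
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        job.setMapperClass(TFIDF_step3.Map.class);
        job.setReducerClass(TFIDF_step3.Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // step-2 output dir
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // step-3 output dir
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}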
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019