Related Topics
No topics are associated with this blog.
package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// MapReduce TF-IDF
//
// step 2
// ------
// map    in  (word@doc, n)
//        out (doc, word;n)
//
// reduce in  (doc, [word;n, word;n, ...])
//        out (word@doc, n;N)
public class TFIDF_step2 {

    // ============ MAPPER ===============
    public static class Map extends Mapper<Text, Text, Text, Text> {
        // Per-instance, reused across map() calls to avoid per-record allocation.
        // NOTE(review): these were static in the original; Text is mutable, so
        // sharing one instance across Mapper objects in the same JVM risks
        // cross-task interference. Instance fields are the conventional form.
        private final Text doc = new Text();
        private final Text word_n = new Text();

        // map in  (word@doc, n)
        //     out (doc, word;n)
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // NOTE(review): assumes the word itself contains no '@' (same
            // assumption step 1 made when it produced the key) — TODO confirm.
            String[] word_doc = key.toString().split("@");
            String word = word_doc[0];
            doc.set(word_doc[1]);

            String n = value.toString();
            word_n.set(word + ";" + n);
            context.write(doc, word_n);
        }
    }

    // ============ REDUCER ===============
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // Reused output holders; per-instance for the same reason as the Mapper's.
        private final Text word_doc = new Text();
        private final Text n_N = new Text();

        // reduce in  (doc, [word;n, word;n, ...])
        //        out (word@doc, n;N)
        //
        // N is the total number of word occurrences in this document, so the
        // emitted value carries both the per-word count and the document total.
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String doc = key.toString();

            // NOTE: at scale, memory may be an issue; possibly forcing the use
            // of intermediate disk storage.
            int N = 0; // running total of all word counts for this doc
            HashMap<String, Integer> wordList = new HashMap<String, Integer>();
            for (Text word_n : values) {
                String[] bits = word_n.toString().split(";");
                String word = bits[0];
                int n = Integer.parseInt(bits[1]);
                // Accumulate rather than overwrite: if the same word key ever
                // arrived twice, a plain put() would drop the earlier count
                // while N still included it, making n;N inconsistent.
                Integer prev = wordList.get(word);
                wordList.put(word, prev == null ? n : prev + n);
                N += n;
            }

            // entrySet() avoids a second hash lookup per word; Map.Entry is
            // fully qualified because the nested Mapper class shadows the name.
            for (java.util.Map.Entry<String, Integer> entry : wordList.entrySet()) {
                word_doc.set(entry.getKey() + "@" + doc);
                n_N.set(entry.getValue() + ";" + N);
                context.write(word_doc, n_N);
            }
        }
    }
}
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019