This is step 3 of the MapReduce TF-IDF pipeline: for each word, the reducer counts the number of documents that contain it (df, the document frequency) and appends df to each (word@doc, n;N) record produced by step 2.

package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// MapReduce TF-IDF
//
// step 3
// ------
// map    in  (word@doc, n;N)
//        out (word, doc;n;N;1)
//
// reduce in  (word, [doc;n;N;count, doc;n;N;count, ...])
//        out (word@doc, n;N;df)

public class TFIDF_step3 {

    // ============ MAPPER ===============
    public static class Map extends Mapper<Text, Text, Text, Text> {
        private final static Text word      = new Text();
        private static Text       doc_n_N_1 = new Text();

        // map in  (word@doc, n;N)
        //     out (word, doc;n;N;1)
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            String[] word_doc = key.toString().split("@");
            word.set(word_doc[0]);
            String doc = word_doc[1];

            String[] n_N = value.toString().split(";");
            String n = n_N[0];
            String N = n_N[1];

            doc_n_N_1.set(doc + ";" + n + ";" + N + ";" + 1);
            context.write(word, doc_n_N_1);
        }
    }

    // ============ REDUCER ===============
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private static Text word_doc = new Text();
        private static Text n_N_df   = new Text();

        // reduce in  (word, [doc;n;N;count, doc;n;N;count, ...])
        //        out (word@doc, n;N;df)
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {

            // NOTE: at scale, memory may be an issue,
            // possibly forcing the use of intermediate disk storage
            HashMap<String, Integer> word_docList = new HashMap<String, Integer>();
            HashMap<String, Integer> docList      = new HashMap<String, Integer>();
            HashMap<String, Integer> wordList     = new HashMap<String, Integer>();

            String word = key.toString();

            for (Text value : values) {
                // doc;n;N;count
                String[] doc_n_N_count = value.toString().split(";");
                String doc    = doc_n_N_count[0];
                Integer n     = Integer.parseInt(doc_n_N_count[1]);
                Integer N     = Integer.parseInt(doc_n_N_count[2]);
                Integer count = Integer.parseInt(doc_n_N_count[3]);

                // save (doc, N), the total number of words in doc;
                // compare Integer values with equals(), not !=, which
                // tests object identity and fails for values over 127
                if (!docList.containsKey(doc)) {
                    docList.put(doc, N);
                } else if (!N.equals(docList.get(doc))) {
                    System.out.println("N != docList.get(doc)" +
                            ": N=" + N +
                            "; doc=" + doc +
                            "; docList.get(doc)=" + docList.get(doc));
                    System.exit(-1);
                }

                // save (word@doc, n), the frequency of word in doc
                if (!word_docList.containsKey(word + "@" + doc)) {
                    word_docList.put(word + "@" + doc, n);
                } else if (!n.equals(word_docList.get(word + "@" + doc))) {
                    System.out.println("n != word_docList.get(word+\"@\"+doc)" +
                            ": n=" + n +
                            "; word@doc=" + word + "@" + doc +
                            "; word_docList=" + word_docList.get(word + "@" + doc));
                    System.exit(-1);
                }

                // accumulate df, the number of documents containing word
                if (!wordList.containsKey(word)) {
                    wordList.put(word, count);
                } else {
                    wordList.put(word, wordList.get(word) + count);
                }
            }

            // compose the (word@doc, n;N;df) output
            for (String WordDoc : word_docList.keySet()) {
                String[] Word_Doc = WordDoc.split("@");
                String Word = Word_Doc[0];
                String Doc  = Word_Doc[1];

                Integer little_en = word_docList.get(WordDoc);
                Integer big_en    = docList.get(Doc);
                Integer df        = wordList.get(Word);

                word_doc.set(WordDoc);
                n_N_df.set(little_en + ";" + big_en + ";" + df);
                context.write(word_doc, n_N_df);
            }
        }
    }
}
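For example, if step 2 produced (fox@doc1, 3;100) and (fox@doc2, 1;80), the reducer for key fox receives [doc1;3;100;1, doc2;1;80;1] and emits (fox@doc1, 3;100;2) and (fox@doc2, 1;80;2), where df=2 because fox appears in two documents.

The class above defines only the mapper and reducer; a driver is still needed to wire them into a Job. The post does not show one, so the sketch below is an assumption: the class name TFIDF_step3_driver and the command-line paths are hypothetical, and it assumes the step-2 output is tab-separated "word@doc<TAB>n;N" lines, which KeyValueTextInputFormat splits into the (Text, Text) pairs the mapper expects.

package com.georgefisher.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver for step 3 -- not part of the original post
public class TFIDF_step3_driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "TFIDF step 3");
        job.setJarByClass(TFIDF_step3.class);

        // step-2 output lines are assumed to look like "word@doc<TAB>n;N";
        // KeyValueTextInputFormat splits each line on the tab into (key, value)
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        job.setMapperClass(TFIDF_step3.Map.class);
        job.setReducerClass(TFIDF_step3.Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // step-2 output
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // step-3 output

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the reducer for a given word must see every document that contains it in order to count df, no combiner is used in this step.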
Originally published: Wednesday, February 04, 2015; most recently modified: Thursday, June 06, 2019