// Related Topics
// No topics are associated with this blog
package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.Objects;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// MapReduce TF-IDF
//
// step 4
// ------
// map    in  (word@doc, n;N;df)
//        out (word@doc, [tf, idf, tfidf])
// reduce none, map-only
public class TFIDF_step4 {

    // ============ MAPPER ===============
    public static class Map extends Mapper<Text, Text, Text, Text> {

        // Number of documents in the original input folder; computed once per
        // mapper in setup(). Instance field (not static) — a Hadoop task runs
        // one Map instance, and shared static mutable state is an anti-pattern.
        private int fileCount;

        // Reusable output objects — standard Hadoop idiom to avoid allocating
        // a new Writable per record.
        private final Text wordDoc = new Text();
        private final Text result = new Text();

        /**
         * Counts the documents in the original input directory (named by the
         * {@code originalInputDir} configuration property) so map() can
         * compute idf = log(fileCount / df).
         *
         * @throws IOException if the filesystem cannot be queried
         */
        @Override
        public void setup(Context context) throws IOException {
            // get the number of documents in the original input folder
            Configuration conf = context.getConfiguration();
            FileSystem fs = FileSystem.get(conf);
            String originalInputDir = Objects.requireNonNull(
                    conf.get("originalInputDir"),
                    "originalInputDir must be set in the job configuration");
            Path pt = new Path(originalInputDir);
            ContentSummary cs = fs.getContentSummary(pt);
            fileCount = (int) cs.getFileCount();
        }

        /**
         * map in  (word@doc, n;N;df)
         *     out (word@doc, [tf, idf, tfidf])
         *
         * where n = occurrences of word in doc, N = total words in doc
         * (parsed but unused — tf here is the raw count n), and df = number
         * of documents containing the word.
         *
         * NOTE: since this step is map-only, its logic could easily be
         *       incorporated into a loop at the end of the previous
         *       step if the cost of job setup and teardown exceeds
         *       the desire for clear code
         */
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            // value is "n;N;df" from the previous step
            String[] n_N_m = value.toString().split(";");
            int n = Integer.parseInt(n_N_m[0]);
            // int N = Integer.parseInt(n_N_m[1]); // total words in doc — unused
            int df = Integer.parseInt(n_N_m[2]);

            double tf = (double) n;
            double idf = Math.log((double) fileCount / (double) df);
            double tfidf = tf * idf;

            wordDoc.set(key);
            result.set("[tf="+n + " idf=log(fileCount/df)=log("+fileCount+"/"+df+")="+idf+ " tfidf=tf*idf="+tfidf+"]");
            context.write(wordDoc, result);
        }
    }
}
// Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019