package com.georgefisher.tfidf;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//MapReduce TF-IDF
//
//step 4
//------
//map in (word@doc, n;N;df)
// out (word@doc, [tf, idf, tfidf])
//reduce none, map-only
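//
//example record (hypothetical, for illustration only):
//    in  (hadoop@doc1.txt, 3;120;2)                  i.e. n=3, N=120, df=2
//    out (hadoop@doc1.txt, [tf=3 idf=... tfidf=...])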
public class TFIDF_step4 {

    // ============ MAPPER ===============
    public static class Map
            extends Mapper<Text, Text, Text, Text> {

        private static int fileCount;

        @Override
        public void setup(Context context) throws IOException {
            // get the number of documents in the original input folder;
            // ContentSummary.getFileCount() counts the files under that path
            Configuration conf = context.getConfiguration();
            FileSystem fs = FileSystem.get(conf);
            Path pt = new Path(conf.get("originalInputDir"));
            ContentSummary cs = fs.getContentSummary(pt);
            fileCount = (int) cs.getFileCount();
        }
        private final static Text word_doc = new Text();
        private final static Text result = new Text();

        //map    in  (word@doc, n;N;df)
        //       out (word@doc, [tf, idf, tfidf])
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {

            // NOTE: since this step is map-only, its logic could easily be
            //       incorporated into a loop at the end of the previous
            //       step if the cost of job setup and teardown exceeds
            //       the desire for clear code
            String[] n_N_df = value.toString().split(";");
            Integer n  = Integer.parseInt(n_N_df[0]);
            //Integer N = Integer.parseInt(n_N_df[1]);   // N is not needed in this step
            Integer df = Integer.parseInt(n_N_df[2]);

            double tf    = (double) n;   // tf here is the raw term count n
            double idf   = Math.log((double) fileCount / (double) df);
            double tfidf = tf * idf;
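
            // worked example (hypothetical numbers): with fileCount=10, n=3, df=2:
            //   tf = 3,  idf = ln(10/2) = 1.609...,  tfidf = 3 * 1.609... = 4.828...
            // (Math.log is the natural log, base e)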
            word_doc.set(key);
            result.set("[tf=" + n +
                    " idf=log(fileCount/df)=log(" + fileCount + "/" + df + ")=" + idf +
                    " tfidf=tf*idf=" + tfidf + "]");
            context.write(word_doc, result);
        }
    }
}
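The post shows only the mapper for this step. To run it, a driver has to set the "originalInputDir" property that setup() reads, point the job at the output of step 3, and declare zero reducers. Here is a minimal sketch of such a driver; it assumes step 3 wrote its records with the default tab separator (hence KeyValueTextInputFormat), and the class name and argument order are illustrative, not from the original code.

package com.georgefisher.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

//hypothetical driver for step 4 (not part of the original post)
//usage: TFIDF_step4_Driver <originalInputDir> <step3 output dir> <step4 output dir>
public class TFIDF_step4_Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // the mapper's setup() reads this property to count the documents
        conf.set("originalInputDir", args[0]);

        Job job = Job.getInstance(conf, "TF-IDF step 4");
        job.setJarByClass(TFIDF_step4.class);
        job.setMapperClass(TFIDF_step4.Map.class);
        job.setNumReduceTasks(0);  // map-only: mapper output is written directly

        // step-3 output is read back as (word@doc <TAB> n;N;df) records
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}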
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019