Related Topics
No topics are associated with this blog
package com.georgefisher.tfidf;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// MapReduce TF-IDF
//
// step 1
// ------
// map    in  (byteCount, line)
//        out (word@doc, 1)
//
// reduce in  (word@doc, [count, count, ...])
//        out (word@doc, n)

/**
 * Step 1 of the TF-IDF pipeline: counts how many times each word occurs in
 * each input document.
 *
 * <p>The mapper emits {@code (word@doc, 1)} for every accepted word of every
 * input line; the reducer sums those ones into {@code (word@doc, n)}.
 */
public class TFIDF_step1 {

    // ============ MAPPER ===============

    /**
     * Splits each input line into lower-cased words, drops stop words and
     * junk tokens, and emits {@code (word@fileName, 1)} for each remaining word.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count emitted for every accepted word; never mutated. */
        private static final IntWritable ONE = new IntWritable(1);

        /** we stop ALL words less than 3 in length */
        private static final int MIN_WORD_LENGTH = 3;

        /**
         * Token separator: one or more non-word characters. Compiled once
         * rather than on every call to {@code String.split}.
         */
        private static final Pattern NON_WORD = Pattern.compile("\\W+");

        // NOTE: stemming might also be considered
        // NOTE: at scale, memory may be an issue;
        //       possibly forcing the inclusion of certain high-frequency
        //       words that are not technically stop words.
        private static final Set<String> STOP_WORDS =
                Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
                        "about", "and", "are", "com", "for", "from", "how",
                        "that", "the", "this", "was", "what", "when", "where",
                        "with", "who", "will", "www",
                        "verse"))); // "verse": for shakespeare's sonnets

        /**
         * Reusable output key. Kept per instance (not static) so separate
         * mapper instances never write into the same buffer.
         */
        private final Text wordDoc = new Text();

        /**
         * Emits {@code (word@doc, 1)} for every accepted word in the line.
         *
         * @param key     input key; per the file's own notes this is the byte
         *                count of the line within the input. Not used here.
         * @param value   one line of input text
         * @param context job context; supplies the input split (which must be
         *                a {@link FileSplit}) and receives the output pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Locale.ROOT keeps lower-casing independent of the JVM's default
            // locale (e.g. a Turkish locale maps 'I' to dotless 'ı', which
            // \W would then treat as a separator and split the word).
            String line = value.toString().toLowerCase(Locale.ROOT);

            // "@fileName" suffix; resolved lazily, once per line instead of
            // once per word, and only if the line yields an accepted word.
            String docSuffix = null;

            for (String word : NON_WORD.split(line)) {
                // skip stop words and junk
                if (!isIndexable(word)) {
                    continue;
                }
                if (docSuffix == null) {
                    String fileName =
                            ((FileSplit) context.getInputSplit()).getPath().getName();
                    docSuffix = "@" + fileName;
                }
                wordDoc.set(word + docSuffix);
                context.write(wordDoc, ONE);
            }
        }

        /**
         * Decides whether a token should be counted.
         *
         * <p>A token is rejected when it is shorter than
         * {@link #MIN_WORD_LENGTH}, does not start with a letter (this also
         * rejects tokens starting with a digit), is a stop word, or contains
         * an underscore (underscore is a regex word character, so it survives
         * the {@code \W+} split).
         *
         * @param word a lower-cased token produced by the split
         * @return {@code true} if the token should be emitted
         */
        private static boolean isIndexable(String word) {
            // The length test comes first so charAt(0) is never reached for
            // the empty token that split() yields when a line starts with a
            // separator.
            return word.length() >= MIN_WORD_LENGTH
                    && Character.isLetter(word.charAt(0))
                    && !STOP_WORDS.contains(word)
                    && word.indexOf('_') < 0;
        }
    }

    // ============ REDUCER ===============

    /**
     * Sums the counts received for each {@code word@doc} key.
     */
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Reusable output value. Kept per instance (not static) so separate
         * reducer instances never write into the same buffer.
         */
        private final IntWritable total = new IntWritable();

        /**
         * Emits {@code (word@doc, n)} where {@code n} is the sum of all
         * counts received for the key.
         *
         * @param key     the {@code word@doc} key
         * @param values  the counts emitted for this key
         * @param context receives the summed output pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {

            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }
}
// TF-IDF Step2 ->
Originally published: Wednesday, February 04, 2015; most-recently modified: Thursday, June 06, 2019