Pattern matching for files within a MapReduce program – given an HDFS path – using the new API 2

October 25, 2014

Driver Class:


package org.puneetha.patternMatching;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/*
 * Driver class: lists the files under the given HDFS input path, matches
 * each filename against a pattern, and adds only the matching files as
 * input to the MapReduce job.
 */
public class WordcountDriver extends Configured implements Tool {
	public int run(String[] args) throws Exception {
		Job job = Job.getInstance(getConf());

		/*
		 * ... Other Driver class code ...
		 */

		Path inputFilePath = new Path(args[0]);
		Path outputFilePath = new Path(args[1]);

		// List all the files under the given input path
		FileSystem fs = FileSystem.newInstance(getConf());
		FileStatus[] statusList = fs.listStatus(inputFilePath);

		/*
		 * Pattern to be matched. Note that "part-r-*" is a filename glob;
		 * as a regular expression the equivalent is "part-r-.*" (reducer
		 * output files such as part-r-00000).
		 */
		Pattern regex = Pattern.compile("part-r-.*");

		if (statusList != null) {
			for (FileStatus status : statusList) {
				// Full path of the current file
				Path fullFilePath = status.getPath();

				// Get the filename, e.g. part-r-00000
				String filename = fullFilePath.getName();

				Matcher matcher = regex.matcher(filename);
				if (matcher.find()) {
					System.out.println("Matched => " + filename);

					/* Any action - here, add the file as job input */
					FileInputFormat.addInputPath(job, fullFilePath);
				} else {
					System.out.println("Not Matched => " + filename);

					/* Any action - here, delete the file */
					fs.delete(fullFilePath, true);
				}
			}
		}

		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		WordcountDriver wordcountDriver = new WordcountDriver();
		int res = ToolRunner.run(wordcountDriver, args);
		System.exit(res);
	}
}
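
Note: the pattern above is really a filename glob rather than a regular expression, and HDFS can expand such globs itself. The snippet below is a minimal sketch of that alternative using FileSystem.globStatus(), reusing the fs, job and inputFilePath variables from the driver above:

FileStatus[] matched = fs.globStatus(new Path(inputFilePath, "part-r-*"));
if (matched != null) {
	for (FileStatus status : matched) {
		System.out.println("Matched => " + status.getPath().getName());
		FileInputFormat.addInputPath(job, status.getPath());
	}
}

To run the driver, package it into a jar and pass the input and output directories as arguments, for example (the jar name patternmatching.jar is just a placeholder):

hadoop jar patternmatching.jar org.puneetha.patternMatching.WordcountDriver <hdfs input path> <hdfs output path>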

2 thoughts on “Pattern matching for files within a MapReduce program – given an HDFS path – using the new API 2”

  1. manish

    I want a MapReduce program in Hadoop to match a particular pattern. For example, I capture a set of packets with tcpdump and open them in Wireshark; based on that, I want to analyse the data in Hadoop. How can this be done? Please help. The generated data is nearly 1.50 GB. Please give me an algorithm or code for that in MapReduce. Thank you.

  2. vinoth

    Thanks Puneetha,
    I am an experienced Java developer. This MR material helped me. Your painting is very good.

