Pig UDF with TestNG test case – concatenate two strings

By | July 4, 2016
PIG UDF class


package org.puneetha.pig.udf;

import java.io.IOException;

import org.apache.log4j.Logger;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

/**
 * Pig eval function that joins two chararray fields with a single space.
 *
 * <p>When the input tuple is {@code null} or carries fewer than two fields,
 * the literal string {@code "NULL"} is returned instead of failing the row.
 *
 * @author Puneetha
 */
public final class ConcatStrPig extends EvalFunc<String>{
	private static final Logger logger = Logger.getLogger(ConcatStrPig.class);

	/** Text placed between the two concatenated fields. */
	private static final String SEPARATOR = " ";

	/** Marker returned when the tuple does not carry two fields. */
	private static final String MISSING_INPUT_RESULT = "NULL";

	/**
	 * Concatenates the first two fields of {@code input}, separated by a space.
	 *
	 * @param input tuple whose first two fields are the strings to join
	 * @return {@code field0 + " " + field1}, or {@code "NULL"} when the tuple is
	 *         {@code null} or has fewer than two fields
	 * @throws IOException if a field cannot be read, or if either of the first
	 *         two fields is {@code null} or not a {@code String}
	 */
	@Override
	public String exec(final Tuple input) throws IOException {
		// String concatenation tolerates a null tuple, so logging can never fail the row.
		if (logger.isDebugEnabled()) {
			logger.debug("Tuple=" + input);
		}

		if (input == null || input.size() < 2) {
			return MISSING_INPUT_RESULT;
		}

		final Object first = input.get(0);
		final Object second = input.get(1);

		// instanceof rejects both null fields and fields of another Pig type,
		// giving a descriptive error instead of a wrapped NPE/ClassCastException.
		if (!(first instanceof String) || !(second instanceof String)) {
			throw new IOException(
					"Exception processing input row: expected two non-null chararray fields but got " + input);
		}

		return (String) first + SEPARATOR + (String) second;
	}
}

Test case – TestNG


package org.puneetha.pig.udf;

import org.apache.pig.data.DefaultTuple;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;

/**
 * TestNG tests for {@link ConcatStrPig}.
 *
 * @author Puneetha
 */
public class ConcatStrPigTest {
	/**
	 * Supplies rows of {@code {param1, param2, expectedResult}} for
	 * {@link #testEvaluate(String, String, String)}.
	 *
	 * @return the test data set, one row per invocation
	 */
	@DataProvider(name = "dataProvider")
	public static String[][] inputData() {
		return new String[][] {
			{"hello", "world", "hello world"},
			{"this is an", "example", "this is an example"}
		};
	}

	/**
	 * Checks that the UDF joins the two inputs with a single space.
	 *
	 * @param param1 first string placed in the input tuple
	 * @param param2 second string placed in the input tuple
	 * @param expectedResultStr value the UDF is expected to return
	 * @throws Exception if the UDF fails; TestNG then reports the failure with its full cause
	 */
	@Test(dataProvider = "dataProvider")
	public void testEvaluate(String param1, String param2, String expectedResultStr) throws Exception {
		DefaultTuple inputDefaultTuple = new DefaultTuple();
		inputDefaultTuple.append(param1);
		inputDefaultTuple.append(param2);

		// TestNG's argument order is (actual, expected).
		Assert.assertEquals(new ConcatStrPig().exec(inputDefaultTuple), expectedResultStr);
	}
}

log4j.properties


# Root logger option
# Every logger inherits this setting: messages at DEBUG level and above go to
# the appender named "stdout", which is configured below.
log4j.rootLogger=DEBUG, stdout

# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
# Line format: timestamp, level left-aligned in 5 characters, last component of
# the logger name, source line number, then the message and a line separator.
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n

pom.xml


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>custom</groupId>
	<artifactId>org.puneetha</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>pig_udf</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.finalname>pig_udf_v1</project.finalname>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<cdh.version>cdh5.5.2</cdh.version>
		<hadoop.version>2.6.0-${cdh.version}</hadoop.version>
		<hive.version>1.1.0-${cdh.version}</hive.version>
		<pig.version>0.12.0-${cdh.version}</pig.version>
		<log4j.version>1.2.17</log4j.version>
		<maven_jar_plugin.version>2.5</maven_jar_plugin.version>
		<codehaus.version>1.2.1</codehaus.version>
		<testng.version>6.9.10</testng.version>
		<junit.version>4.8.1</junit.version>
	</properties>

	<dependencies>
		<!-- Log4j - Logging -->
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>${log4j.version}</version>
		</dependency>

		<!-- Testing -->
		<!-- Test NG -->
		<dependency>
			<groupId>org.testng</groupId>
			<artifactId>testng</artifactId>
			<version>${testng.version}</version>
			<!-- Only needed to compile and run the tests; keep it off the runtime classpath -->
			<scope>test</scope>
		</dependency>
		
		<!-- Hadoop -->
		<!-- Cloudera Core Dependencies -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>${hadoop.version}</version>
		</dependency>

		<!-- Cloudera Hive Dependencies -->
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-jdbc</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-metastore</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-service</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<!-- Cloudera PIG Dependencies -->
		<dependency>
			<groupId>org.apache.pig</groupId>
			<artifactId>pig</artifactId>
			<version>${pig.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.pig</groupId>
			<artifactId>pigunit</artifactId>
			<version>${pig.version}</version>
		</dependency>

		<!-- Maven build plugins are configured in the build section of this POM;
			they must not be declared as project dependencies. -->
	</dependencies>

	<build>
		<finalName>${project.finalname}</finalName>
		<pluginManagement>
			<plugins>
				<plugin>
					<groupId>org.codehaus.mojo</groupId>
					<artifactId>exec-maven-plugin</artifactId>
					<version>${codehaus.version}</version>
				</plugin>
			</plugins>
		</pluginManagement>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-jar-plugin</artifactId>
				<version>${maven_jar_plugin.version}</version>
			</plugin>
		</plugins>
	</build>

	<repositories>
		<repository>
			<id>cloudera-repo</id>
			<!-- HTTPS is required: Maven 3.8.1+ blocks plain-HTTP repositories by default -->
			<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
		</repository>
	</repositories>
</project>

Using the UDF


-- Make the UDF jar available to Pig and give the class a short alias
REGISTER '/home/localuser/pig_udf_v1.jar';
DEFINE udf_concat org.puneetha.pig.udf.ConcatStrPig();

-- Job name
SET job.name 'Test concatenation udf - pig';

-- Load tab-separated records with two chararray columns
A = LOAD '/user/cloudera/wordcount/output/' 
USING PigStorage('\t') 
AS (field1:CHARARRAY, field2:CHARARRAY);

-- Emit both original fields followed by their space-separated concatenation
B = FOREACH A GENERATE
field1,
field2,
udf_concat(field1, field2)
;

DUMP B;
Category: Pig

Leave a Reply

Your email address will not be published. Required fields are marked *