Hive UDF with testNG test case – concatenate two strings

By | July 4, 2016
Hive UDF class


package org.puneetha.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.Text;

import org.apache.log4j.Logger;

import org.apache.hadoop.hive.ql.exec.Description;

/**
 * Hive UDF that joins two strings with a single space between them.
 *
 * <p>Registered in Hive under the name {@code udf_concat}; for example
 * {@code udf_concat('hello','world')} yields {@code hello world}.
 * If either argument is SQL NULL the result is NULL instead of a
 * {@link NullPointerException} that would abort the query.
 *
 * @author Puneetha
 */
@Description(name = "udf_concat"
			, value = "_FUNC_(STRING, STRING) - RETURN_TYPE(STRING)\n"
					+ "Description: Concatenate two strings, separated by spaces"
			, extended = "Example:\n"
				+ "  > SELECT udf_concat('hello','world') FROM src;\n"
				+ " hello world"
			)
@UDFType(deterministic=true)
public final class ConcatStr extends UDF  {
	// Name the logger after this class. Deriving the name from
	// Thread.currentThread().getStackTrace()[0] typically yields
	// "java.lang.Thread", because the first frame is getStackTrace() itself.
	private static final Logger logger = Logger.getLogger(ConcatStr.class);

	/** Text placed between the two input values. */
	private static final String SEPARATOR = " ";

	/**
	 * Concatenates {@code param1} and {@code param2}, separated by one space.
	 *
	 * @param param1 left-hand value; may be {@code null} (SQL NULL)
	 * @param param2 right-hand value; may be {@code null} (SQL NULL)
	 * @return a new {@link Text} holding {@code param1 + " " + param2}, or
	 *         {@code null} when either input is {@code null}
	 * @throws HiveException declared for interface compatibility; this
	 *         implementation does not throw it itself
	 */
	public Text evaluate(final Text param1, final Text param2) throws HiveException {
		// A NULL column value arrives as a null reference; propagate NULL.
		if (param1 == null || param2 == null) {
			return null;
		}

		final String left = param1.toString();
		final String right = param2.toString();

		// Guarded so the message string is not built for every row when
		// debug logging is switched off.
		if (logger.isDebugEnabled()) {
			logger.debug("param1=" + left + "; param2=" + right);
		}

		return new Text(left + SEPARATOR + right);
	}
}

Test case – TestNG


package org.puneetha.hive.udf;

import org.apache.hadoop.io.Text;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

/**
 * TestNG test for {@link ConcatStr}: feeds pairs of strings to
 * {@code evaluate} and checks the space-separated concatenation.
 *
 * @author Puneetha
 */
public class ConcatStrTest {
	/**
	 * Supplies the test rows as {@code {param1, param2, expectedResult}}.
	 *
	 * @return one row per test invocation
	 */
	@DataProvider(name = "dataProvider")
	public static String[][] inputData() {
		return new String[][] {
					  {"hello" , "world" , "hello world"},
					  // The expected value must not carry a trailing "!":
					  // the UDF only inserts a single space between the inputs.
					  {"this is an" , "example" , "this is an example"}
					};
	}

	/**
	 * Verifies that {@code evaluate(param1, param2)} returns the expected text.
	 *
	 * @param param1 left-hand input
	 * @param param2 right-hand input
	 * @param expectedResultStr expected concatenation
	 * @throws Exception if the UDF throws; TestNG reports it as a failure
	 *         with the full stack trace, so no try/catch is needed here
	 */
	@Test(dataProvider = "dataProvider" )
	public void testEvaluate(String param1, String param2, String expectedResultStr) throws Exception {
		ConcatStr concatStr1 = new ConcatStr();

		Text actual = concatStr1.evaluate(new Text(param1), new Text(param2));

		// TestNG's argument order is (actual, expected).
		Assert.assertEquals(actual, new Text(expectedResultStr));
	}
}

log4j.properties


# Root logger option
# Everything at DEBUG level and above goes to the "stdout" appender defined below.
log4j.rootLogger=DEBUG, stdout

# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp, level padded to 5 characters, last component of the
# logger name, source line number, then the message and a line break.
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n

pom.xml


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>custom</groupId>
	<artifactId>org.puneetha</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>hive_udf</name>
	<url>http://maven.apache.org</url>

	<properties>
		<!-- Name of the built jar: target/hive_udf_v1.jar -->
		<project.finalname>hive_udf_v1</project.finalname>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<!-- Cloudera distribution the Hadoop/Hive/Pig versions are pinned to -->
		<cdh.version>cdh5.5.2</cdh.version>
		<hadoop.version>2.6.0-${cdh.version}</hadoop.version>
		<hive.version>1.1.0-${cdh.version}</hive.version>
		<pig.version>0.12.0-${cdh.version}</pig.version>
		<log4j.version>1.2.17</log4j.version>
		<maven_jar_plugin.version>2.5</maven_jar_plugin.version>
		<codehaus.version>1.2.1</codehaus.version>
		<testng.version>6.9.10</testng.version>
		<junit.version>4.8.1</junit.version>
	</properties>

	<dependencies>
		<!-- Log4j - Logging -->
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>${log4j.version}</version>
		</dependency>

		<!-- Testing -->
		<!-- Test NG: only needed to compile and run the tests -->
		<dependency>
			<groupId>org.testng</groupId>
			<artifactId>testng</artifactId>
			<version>${testng.version}</version>
			<scope>test</scope>
		</dependency>
		
		<!-- Hadoop -->
		<!-- Cloudera Core Dependencies -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>${hadoop.version}</version>
		</dependency>

		<!-- Cloudera Hive Dependencies -->
		<!-- hive-exec contains UDF, Description, UDFType and HiveException,
			which the UDF class imports directly; declare it explicitly rather
			than relying on a transitive dependency. The cluster supplies it
			at runtime, hence "provided". -->
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-exec</artifactId>
			<version>${hive.version}</version>
			<scope>provided</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-jdbc</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-metastore</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-service</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<!-- Cloudera PIG Dependencies -->
		<dependency>
			<groupId>org.apache.pig</groupId>
			<artifactId>pig</artifactId>
			<version>${pig.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.pig</groupId>
			<artifactId>pigunit</artifactId>
			<version>${pig.version}</version>
		</dependency>

		<!-- Note: maven-clean-plugin used to be listed here as a dependency.
			Build plugins are configured under <build>, not <dependencies>;
			as a dependency it only put plugin classes on the compile
			classpath, so the entry has been dropped. -->
	</dependencies>

	<build>
		<finalName>${project.finalname}</finalName>
		<pluginManagement>
			<plugins>
				<plugin>
					<groupId>org.codehaus.mojo</groupId>
					<artifactId>exec-maven-plugin</artifactId>
					<version>${codehaus.version}</version>
				</plugin>
			</plugins>
		</pluginManagement>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-jar-plugin</artifactId>
				<version>${maven_jar_plugin.version}</version>
			</plugin>
		</plugins>
	</build>

	<repositories>
		<!-- HTTPS: Maven 3.8.1+ blocks plain-http repositories by default -->
		<repository>
			<id>cloudera-repo</id>
			<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
		</repository>
	</repositories>
</project>

Deploy Hive UDF
Deploy a function:


-- Register the function udf_concat, implemented by class ConcatStr, loading it from the jar stored in HDFS.
CREATE FUNCTION udf_concat AS 'org.puneetha.hive.udf.ConcatStr' USING JAR 'hdfs:///jars/hive_udf_v1.jar';

Using the UDF:


-- Call the UDF with two string literals.
SELECT udf_concat('hello','world');
Output: hello world

Describe function usage:


-- Show the short usage text taken from the "value" attribute of the @Description annotation.
DESCRIBE FUNCTION udf_concat;
Output:
+--------------------------------------------------------------+--+
|                           tab_name                           |
+--------------------------------------------------------------+--+
| _FUNC_(STRING, STRING) - RETURN_TYPE(STRING)		       |
| Description: Concatenate two strings, separated by spaces    |
+--------------------------------------------------------------+--+

-- Also show the "extended" attribute of the @Description annotation (the example).
DESCRIBE FUNCTION EXTENDED udf_concat;
Output:
+--------------------------------------------------------------+--+
|                           tab_name                           |
+--------------------------------------------------------------+--+
| _FUNC_(STRING, STRING) - RETURN_TYPE(STRING)		       |
| Description: Concatenate two strings, separated by spaces    |
| Example:						       |
| > SELECT udf_concat('hello','world') FROM src;	       |
| hello world						       |
+--------------------------------------------------------------+--+

One thought on “Hive UDF with testNG test case – concatenate two strings”

  1. Abarajithan SA

    Hi Puneetha,

    Wonderful, it worked for me.. thanks a lot! I am going to follow your blog from now on.

    Thanks,

    Reply

Leave a Reply to Abarajithan SA Cancel reply

Your email address will not be published. Required fields are marked *