Hive UDFs – Simple and Generic UDFs

By | February 19, 2017
Hive UDFs:
These are regular user-defined functions that operate row-wise and output one result for one row, such as most built-in mathematics and string functions.


Ex: 
SELECT LOWER(str) FROM table_name;
SELECT CONCAT(column1,column2) AS x FROM table_name;

There are 2 ways of writing the UDFs

  1. Simple – extend UDF class
  2. Generic – extend GenericUDF class

In this post, we will create a UDF to concatenate strings, implementing it first by extending the simple UDF class and then by extending the GenericUDF class.

1. Simple – extend <org.apache.hadoop.hive.ql.exec.UDF> class

To write a Simple UDF, the below 2 steps are necessary:
1. Extend the org.apache.hadoop.hive.ql.exec.UDF class
2. Write an “evaluate” method.


package org.puneetha.hive.udf.udfconcat;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;

/***
 * 
 * 
 * @author Puneetha
 *
 */
@Description(name = "udf_concat",
	    value = "_FUNC_(str1, str2, ... strN) - returns the concatenation of str1, str2, ... strN",
	    extended = "Returns NULL if any argument is NULL.\n"
					    + "Example:\n"
					    + "  > SELECT _FUNC_('abc', 'def') FROM src LIMIT 1;\n"
					    + "  'abcdef'\n"
					    + "  > SELECT _FUNC_(1, 2) FROM src LIMIT 1;\n"
					    + "  '12'" 
			    	)
public class UDFConcat extends UDF {

	public UDFConcat() {
	}

	/*
	 * Output buffer reused across rows to avoid one allocation per call.
	 * Hive invokes evaluate() serially on a UDF instance, so reuse is safe
	 * in that context; note the returned Text is overwritten on the next call.
	 */
	private final Text text = new Text();

	/**
	 * Concatenates the given Text arguments in order.
	 *
	 * @param args the strings to concatenate; may be empty
	 * @return the concatenation, or {@code null} if any argument is null
	 */
	public Text evaluate(Text... args) {
		text.clear();
		for (Text arg : args) {
			if (arg == null) {
				return null;
			}
			text.append(arg.getBytes(), 0, arg.getLength());
		}
		return text;
	}

	/**
	 * Concatenates the decimal string forms of the given integer arguments.
	 *
	 * @param args the integers to concatenate; may be empty
	 * @return the concatenation, or {@code null} if any argument is null
	 */
	public Text evaluate(IntWritable... args) {
		text.clear();
		for (IntWritable arg : args) {
			// Was "arg.toString() == null", which throws NPE on a null
			// element instead of returning NULL as documented.
			if (arg == null) {
				return null;
			}
			// toString() once per element instead of three times.
			byte[] bytes = arg.toString().getBytes();
			text.append(bytes, 0, bytes.length);
		}
		return text;
	}
}

Test Case:


package org.puneetha.hive.udf.udfconcat;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

/***
 * 
 * 
 * @author Puneetha
 *
 */
public class UDFConcatTest {

	/** Two string arguments plus the expected concatenation. */
	@DataProvider(name = "dataProvider1")
	public static String[][] inputData1() {
		// NOTE: the row {"hello", "world", "hello9world"} was removed — it was
		// listed under "Success" but asserts "helloworld".equals("hello9world"),
		// which can never pass.
		return new String[][] {
				{"hello", "world", "helloworld"},
				{"welcome", " to the program", "welcome to the program"},
				{"", "", ""}
		};
	}

	@Test(dataProvider = "dataProvider1")
	public void testEvaluate1(String param1, String param2, String expectedResultStr) {
		UDFConcat udfConcat = new UDFConcat();
		// TestNG's signature is assertEquals(actual, expected) — actual first.
		// Let assertion/runtime failures propagate; wrapping them in
		// try/catch + printStackTrace + fail() only loses the failure message.
		Assert.assertEquals(
				udfConcat.evaluate(new Text(param1), new Text(param2)),
				new Text(expectedResultStr));
	}

	/** Three string arguments plus the expected concatenation. */
	@DataProvider(name = "dataProvider2")
	public static String[][] inputData2() {
		return new String[][] {
				{"how", " are", " you", "how are you"},
				{"its", " nice", " out there!", "its nice out there!"},
				{"", "", "", ""}
		};
	}

	@Test(dataProvider = "dataProvider2")
	public void testEvaluate2(String param1, String param2, String param3,
			String expectedResultStr) {
		UDFConcat udfConcat = new UDFConcat();
		Assert.assertEquals(
				udfConcat.evaluate(new Text(param1), new Text(param2), new Text(param3)),
				new Text(expectedResultStr));
	}

	/** Two int arguments plus the expected concatenated value. */
	@DataProvider(name = "dataProvider3")
	public static Integer[][] inputData3() {
		return new Integer[][] {
				{1, 2, 12}
		};
	}

	@Test(dataProvider = "dataProvider3")
	public void testEvaluate3(int param1, int param2, int expectedResult) {
		UDFConcat udfConcat = new UDFConcat();
		Assert.assertEquals(
				udfConcat.evaluate(new IntWritable(param1), new IntWritable(param2)),
				new Text(String.valueOf(expectedResult)));
	}
}



2. Generic – extend <org.apache.hadoop.hive.ql.udf.generic.GenericUDF> class
To write a Generic UDF, the below 4 steps are necessary:

  1. Extend the org.apache.hadoop.hive.ql.udf.generic.GenericUDF class
  2. Write the “initialize” method. This will be called once and only once per GenericUDF instance.
  3. Write an “evaluate” method.
  4. Override the method “getDisplayString”. This method will get the String to be displayed in explain.


pom.xml (this file is common for both the Simple and the Generic UDF):


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>custom</groupId>
	<artifactId>org.puneetha.hive.udf</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>hive_udf</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.finalname>hive_udf_v1</project.finalname>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<cdh.version>cdh5.5.2</cdh.version>
		<hadoop.version>2.6.0-${cdh.version}</hadoop.version>
		<hive.version>1.1.0-${cdh.version}</hive.version>
		<pig.version>0.12.0-${cdh.version}</pig.version>
		<log4j.version>1.2.17</log4j.version>
		<maven_jar_plugin.version>2.5</maven_jar_plugin.version>
		<codehaus.version>1.2.1</codehaus.version>
		<mockito.version>1.10.19</mockito.version>
		<testng.version>6.9.10</testng.version>
		<junit.version>4.8.1</junit.version>
		<java.home>C:\Program Files\Java\jdk1.8.0_121</java.home>
	</properties>

	<dependencies>
		<!-- Log4j - Logging -->
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>${log4j.version}</version>
		</dependency>

		<!-- Testing (test scope: not bundled into the UDF jar) -->
		<!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
		<dependency>
			<groupId>org.mockito</groupId>
			<artifactId>mockito-all</artifactId>
			<version>${mockito.version}</version>
			<scope>test</scope>
		</dependency>

		<!-- Test NG -->
		<dependency>
			<groupId>org.testng</groupId>
			<artifactId>testng</artifactId>
			<version>${testng.version}</version>
			<scope>test</scope>
		</dependency>

		<!-- Hadoop -->
		<!-- Cloudera Core Dependencies. "provided" scope: the cluster supplies
		     these at runtime, so they must not be packaged with the UDF. -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>${hadoop.version}</version>
			<scope>provided</scope>
		</dependency>

		<!-- Cloudera Hive Dependencies (provided by the Hive installation) -->
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-jdbc</artifactId>
			<version>${hive.version}</version>
			<scope>provided</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-metastore</artifactId>
			<version>${hive.version}</version>
			<scope>provided</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-service</artifactId>
			<version>${hive.version}</version>
			<scope>provided</scope>
		</dependency>

		<!-- Cloudera PIG Dependencies (provided by the cluster) -->
		<dependency>
			<groupId>org.apache.pig</groupId>
			<artifactId>pig</artifactId>
			<version>${pig.version}</version>
			<scope>provided</scope>
		</dependency>

		<!-- NOTE: maven-clean-plugin was removed from <dependencies>: it is a
		     build plugin, not a library, and was also pinned to the jar-plugin
		     version property. Maven runs clean via its default lifecycle. -->

		<dependency>
			<groupId>jdk.tools</groupId>
			<artifactId>jdk.tools</artifactId>
			<version>1.8.0_121</version>
			<scope>system</scope>
			<systemPath>${java.home}/lib/tools.jar</systemPath>
		</dependency>
	</dependencies>

	<build>
		<finalName>${project.finalname}</finalName>
		<pluginManagement>
			<plugins>
				<plugin>
					<groupId>org.codehaus.mojo</groupId>
					<artifactId>exec-maven-plugin</artifactId>
					<version>${codehaus.version}</version>
				</plugin>
			</plugins>
		</pluginManagement>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-jar-plugin</artifactId>
				<version>${maven_jar_plugin.version}</version>
			</plugin>
		</plugins>
	</build>

	<repositories>
		<repository>
			<id>cloudera-repo</id>
			<url>http://repository.cloudera.com/artifactory/cloudera-repos/</url>
		</repository>
	</repositories>
</project>

Leave a Reply

Your email address will not be published. Required fields are marked *