package com.example.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Custom UDF that converts a string to upper case.
 */
public class UpperCaseUDF extends UDF {
    /**
     * Core evaluate method required by the UDF framework. Overloading is
     * supported, so a UDF class may declare several evaluate methods.
     *
     * @param input the text to convert; may be null
     * @return the upper-cased text, or null when the input is null
     */
    public Text evaluate(Text input) {
        // Propagate SQL NULL values unchanged.
        if (input == null) {
            return null;
        }
        // Convert the string content to upper case.
        return new Text(input.toString().toUpperCase());
    }

    /**
     * Overload accepting a plain Java String input.
     *
     * @param input the string to convert; may be null
     * @return the upper-cased text, or null when the input is null
     */
    public Text evaluate(String input) {
        if (input == null) {
            return null;
        }
        return new Text(input.toUpperCase());
    }
}
2.4 Build and Package
Use Maven to compile and package the project into a JAR file:
mvn clean package
2.5 Registering and Using the UDF in Hive
There are two ways to register a UDF in Hive:
Option 1: Temporary registration
A temporarily registered UDF is only effective in the current session:
-- Add the JAR to the Hive session
hive> ADD JAR /path/to/hive-udf-1.0-SNAPSHOT.jar;
-- Create a temporary function
hive> CREATE TEMPORARY FUNCTION to_upper AS 'com.example.hive.udf.UpperCaseUDF';
-- Use the custom function
hive> SELECT to_upper('hello world');
-- Output: HELLO WORLD
-- Use it when querying table data
hive> SELECT id, to_upper(username) AS upper_username FROM users;
Option 2: Permanent registration
A permanently registered UDF is effective in all sessions:
-- Create a permanent function (requires Hive 2.2.0+)
hive> CREATE FUNCTION to_upper AS 'com.example.hive.udf.UpperCaseUDF'
    > USING JAR 'hdfs:///user/hive/udf/hive-udf-1.0-SNAPSHOT.jar';
-- List matching functions
hive> SHOW FUNCTIONS LIKE 'to_upper';
-- Show function details
hive> DESCRIBE FUNCTION to_upper;
-- Output: to_upper(string) - Converts a string to uppercase
3. Developing a UDAF (User-Defined Aggregate Function)
3.1 UDAF Overview
A UDAF implements a custom aggregate operation, such as sum, average, or maximum. UDAF development is more complex than UDF development and requires implementing the following components:
package com.example.hive.udaf;
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
/**
 * Custom UDAF that computes the arithmetic mean (average) of a column of
 * double values.
 *
 * <p>Note: this uses the legacy {@code UDAF}/{@code UDAFEvaluator} API. Only
 * {@code init()} is declared by the {@code UDAFEvaluator} interface; the
 * remaining lifecycle methods (iterate/terminatePartial/merge/terminate) are
 * part of Hive's UDAF contract and therefore carry no {@code @Override}.</p>
 */
public class AvgUDAF extends UDAF {

    /**
     * Inner evaluator class that implements the aggregation logic.
     */
    public static class AvgEvaluator implements UDAFEvaluator {

        // Intermediate aggregation state: running sum and row count.
        public static class PartialResult {
            long count;
            double sum;
        }

        // Null until the first non-null row is seen.
        private PartialResult partial;

        /**
         * Resets the evaluator state; called before aggregation starts.
         */
        @Override
        public void init() {
            partial = null;
        }

        /**
         * Processes one input row.
         *
         * @param value the value for this row; SQL NULLs are skipped and do
         *              not contribute to the average
         * @return true to indicate the row was consumed successfully
         */
        public boolean iterate(DoubleWritable value) {
            if (value == null) {
                return true;
            }
            if (partial == null) {
                partial = new PartialResult();
            }
            partial.sum += value.get();
            partial.count++;
            return true;
        }

        /**
         * Ends the partial (map-side) aggregation.
         *
         * @return the intermediate state, or null when no non-null rows were seen
         */
        public PartialResult terminatePartial() {
            // The original "partial == null ? null : partial" was a no-op;
            // returning the field directly is equivalent.
            return partial;
        }

        /**
         * Merges a partial result produced by another evaluator instance.
         *
         * @param other the partial state to merge; null is a no-op
         * @return true to indicate the merge succeeded
         */
        public boolean merge(PartialResult other) {
            if (other == null) {
                return true;
            }
            if (partial == null) {
                partial = new PartialResult();
            }
            partial.sum += other.sum;
            partial.count += other.count;
            return true;
        }

        /**
         * Ends the global aggregation and produces the final result.
         *
         * @return the average, or null when no non-null rows were aggregated
         */
        public DoubleWritable terminate() {
            if (partial == null || partial.count == 0) {
                return null;
            }
            return new DoubleWritable(partial.sum / partial.count);
        }
    }
}
3.4 Using the UDAF in Hive
-- Add the JAR
hive> ADD JAR /path/to/hive-udf-1.0-SNAPSHOT.jar;
-- Create a temporary aggregate function
hive> CREATE TEMPORARY FUNCTION my_avg AS 'com.example.hive.udaf.AvgUDAF';
-- Use the custom aggregate function
hive> SELECT my_avg(amount) AS avg_amount FROM sales;
-- Output: the average sales amount
-- Use it with GROUP BY
hive> SELECT category, my_avg(price) AS avg_price FROM products GROUP BY category;
4. Developing a UDTF (User-Defined Table-Generating Function)
4.1 UDTF Overview
A UDTF transforms one input row into multiple output rows, for example splitting an array or a JSON string into several rows. Developing a UDTF requires extending the GenericUDTF class and implementing the initialize, process, and close methods:
Implement the process method to handle input data and emit multiple output rows
Implement the close method to release resources
Compile and package the project into a JAR file
Register and use the UDTF in Hive
4.3 Writing the UDTF Class
Write a simple UDTF that splits a string into multiple rows by a given delimiter:
package com.example.hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Custom UDTF that splits a string into multiple rows by a delimiter.
 *
 * <p>Usage: {@code my_split(str)} splits on "," by default;
 * {@code my_split(str, delim)} splits on the given literal delimiter.</p>
 */
public class SplitUDTF extends GenericUDTF {

    // Default delimiter; overridden per row when a second argument is given.
    private String delimiter = ",";
    // Reusable single-column output row handed to forward().
    private final transient Object[] forwardObj = new Object[1];

    /**
     * Declares the output schema: one string column named "word".
     *
     * @param argOIs object inspectors for the call arguments
     * @return the struct inspector describing the output rows
     * @throws UDFArgumentException if the argument count is not 1 or 2
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        if (argOIs.length < 1 || argOIs.length > 2) {
            throw new UDFArgumentException(
                "my_split expects 1 or 2 arguments: my_split(str[, delimiter])");
        }
        // BUG FIX: the original read argOIs[1].getTypeName() here, which yields
        // the argument's TYPE name (e.g. "string"), not its value. The actual
        // delimiter value is only available per row, so it is read in process().
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("word");
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits the first argument and emits one output row per part.
     *
     * @param args args[0] is the string to split; optional args[1] is the delimiter
     * @throws HiveException if forwarding a row fails
     */
    @Override
    public void process(Object[] args) throws HiveException {
        if (args == null || args.length == 0 || args[0] == null) {
            // NULL input produces no rows.
            return;
        }
        if (args.length > 1 && args[1] != null) {
            delimiter = args[1].toString();
        }
        String input = args[0].toString();
        // Pattern.quote treats the delimiter as a literal string, so regex
        // metacharacters such as "|" or "." split correctly (the original
        // passed the raw delimiter to split(), which interprets it as a regex).
        String[] parts = input.split(Pattern.quote(delimiter));
        for (String part : parts) {
            forwardObj[0] = part;
            forward(forwardObj); // emit one output row
        }
    }

    /**
     * Releases resources at the end of processing.
     */
    @Override
    public void close() throws HiveException {
        // Intentionally empty: this UDTF holds no external resources.
    }
}
4.4 Using the UDTF in Hive
-- Add the JAR
hive> ADD JAR /path/to/hive-udf-1.0-SNAPSHOT.jar;
-- Create a temporary table-generating function
hive> CREATE TEMPORARY FUNCTION my_split AS 'com.example.hive.udtf.SplitUDTF';
-- Use the UDTF with LATERAL VIEW
hive> SELECT id, word
    > FROM users
    > LATERAL VIEW my_split(hobbies, ',') exploded_hobbies AS word;
-- Output: each user's hobbies expanded into multiple rows
-- Use the UDTF directly
hive> SELECT my_split('a,b,c,d');
-- Output:
-- a
-- b
-- c
-- d
-- Use a custom delimiter
hive> SELECT my_split('x|y|z', '|');
-- Output:
-- x
-- y
-- z
5. Custom Function Best Practices
5.1 Development Best Practices
Handle null values: every custom function must handle null inputs gracefully to avoid NullPointerExceptions