您绝对可以在没有外部GROUP BY
的情况下执行此操作:
CREATE OR REPLACE FUNCTION `dataset.sumdistinct` (values array<struct<id int64,val int64>>) as (
(
select
struct(
count(distinct x.id) as col1,sum(distinct x.val) as col2
) from unnest(values) as x
)
);
select sumdistinct(array_agg(struct(id as id,val as val))) from `dataset.table`
,
以下是用于BigQuery标准SQL
#standardSQL
CREATE TEMP FUNCTION SumDistinct(arr ANY TYPE) AS ((
SELECT AS STRUCT
COUNT(DISTINCT id) unique_ids,SUM(val) total_value
FROM (
SELECT ANY_VALUE(t).*
FROM UNNEST(arr) t
GROUP BY FORMAT('%t',t)
)
));
SELECT SumDistinct(ARRAY_AGG(STRUCT(id,val))).*
FROM `project.dataset.data`
如果要应用于您的问题的样本数据-结果为
Row unique_ids total_value
1 3 600
,
根据您要如何将不同的val解析为相同的id,可以在下面的sql中调整聚合函数(max(val)
):
with data as
(select 1 as id,100 as val union all
select 1,100 union all
select 1,100 union all
select 2,200 union all
select 2,200 union all
select 3,300 union all
select 3,300
)
SELECT count(1) as unique_ids,sum(val) as total_value
FROM (
SELECT id,max(val) val
FROM data
GROUP BY id
)
,
基于注释,看来您希望UDF能够调用内部的聚合函数。也许您正在寻找用户定义的聚合函数,BigQuery不支持该函数,但可以采用以下形式:
输出不是您期望的,因为UDF无法输出2列(如您的示例中所示),希望您有一个想法,需要对array_agg()字段进行操作,并且您的UDF在内部进行UNNEST()并能够使用系统集合函数,例如SUM():
CREATE TEMP FUNCTION sumdistinct (unique_key INT64,val_array ARRAY<INT64>) AS (
(SELECT COALESCE(ROUND(COALESCE(CAST((SUM(DISTINCT (CAST(ROUND(COALESCE(safe_cast(val_to_sum as float64),0)*(1/1000*1.0),9) AS NUMERIC) + (cast(cast(concat('0x',substr(to_hex(md5(CAST(unique_key AS STRING))),1,15)) as int64) as numeric) * 4294967296 + cast(cast(concat('0x',16,8)) as int64) as numeric)) * 0.000000001 )) - SUM(DISTINCT (cast(cast(concat('0x',8)) as int64) as numeric)) * 0.000000001) ) / (1/1000*1.0) AS FLOAT64),0),6),0)
FROM unnest(val_array) val_to_sum)
);
with data as
(select 1 as id,300
)
SELECT sumdistinct(id,array_agg(val))
FROM data
GROUP BY id
本文链接:https://www.f2er.com/3063695.html