DistributionMatchReduce version 1.7, DistributionMatchMultiInput version 1.4
SELECT * FROM DistributionMatchReduce (
  ON DistributionMatchMultiInput (
    ON (SELECT RANK() OVER (PARTITION BY column [,...] ORDER BY column) AS rank, *
        FROM input_table WHERE column IS NOT NULL
    ) AS InputTable PARTITION BY column [,...]
    ON (SELECT col [,...], COUNT(*) AS group_size
        FROM input_table WHERE column IS NOT NULL
        GROUP BY column [,...]
    ) AS GroupStatistics PARTITION BY column [,...]
    USING
    TargetColumn ('target_column')
    [ Tests ('test' [,...]) ]
    Distributions ('distribution:parameters' [,...])
    [ GroupByColumns ({ 'group_column' | group_column_range }[,...]) ]
    [ MinGroupSize (min_group_size) ]
    [ NumCell (cell_size) ]
  ) AS alias_1 PARTITION BY column [,...]
) AS alias_2;
If your input table already includes a rank column, replace this clause:
ON (SELECT RANK()...
with this clause:
ON (SELECT * FROM input_table)