DistributionMatchReduce version 1.7, DistributionMatchMultiInput version 1.4
SELECT * FROM DistributionMatchReduce ( ON DistributionMatchMultiInput ( ON (SELECT RANK() OVER (PARTITION BY column [,...] ORDER BY column) AS rank, * FROM input_table WHERE column IS NOT NULL) AS InputTable PARTITION BY ANY ON (SELECT column[,...] COUNT(*) AS group_size, AVG (column) AS mean, STDDEV (column) AS sd, CASE WHEN MIN (column) > 0 THEN AVG (LN ( CASE WHEN column > 0 THEN column ELSE 1 END) ) ELSE 0 END AS mean_of_ln, CASE WHEN MIN (column) > 0 THEN STDDEV (LN ( CASE WHEN column > 0 THEN column ELSE 1 END) ) ELSE -1 END AS sd_of_ln, Max (column) AS maximum, MIN (column) AS minimum FROM input_table WHERE column IS NOT NULL GROUP BY column[,...] ) AS GroupStatistics DIMENSION USING TargetColumn ('target_column') [ Tests ('test' [,...]) ] [ Distributions ('distribution1:parameter1',...) ] [ GroupByColumns ({ 'group_column' | group_column_range }[,...]) ] MinGroupSize (minGroupSize) [ NumCell (cell_Size) ] ) AS alias_1 PARTITION BY column[,...] [ USING NumTopMatches ('num_top_matches') ] ) AS alias_2;
For continuous distributions, if your input table already includes a rank column, replace this clause:
ON (SELECT RANK()...
with this clause:
ON SELECT * FROM input_table.