Using DB2 OLAP Functions
Aggregation (sum):
select empno, firstname, lastname, sum(sales) as tot_sales, (salary + bonus + comm) as compensation
from employee, sales
where sex='M' and year(sales_date) = 1996 and lastname = sales_person
group by empno, firstname, lastname, (salary + bonus + comm)
Hierarchical subtotals (rollup):
select year(sales_date) as year, count(*) as tot_sales
from sales
group by rollup (year(sales_date))
year   tot_sales
-----  ---------
-             41
1995           5
1996          36
select year(sales_date) as year, region, count(*) as tot_sales
from sales
group by rollup (year(sales_date), region)
select year(sales_date) as year, region, sales_person, count(*) as tot_sales
from sales
group by rollup (year(sales_date), region, sales_person)
cube builds a data cube: it aggregates over every combination of the listed dimensions, so the data can be viewed from all directions:
select year(sales_date) as year, region, count(*) as tot_sales
from sales
group by cube (year(sales_date), region)
select year(sales_date) as year, region, sales_person, count(*) as tot_sales
from sales
group by cube (year(sales_date), region, sales_person)
Other OLAP functions:
count, avg, stddev, corr, regr_*, rownumber, rank, windowed aggregates
Simple statistics:
select country, year, count(*) as count, sum(amount) as sum, avg(amount) as avg, max(amount) as max, stddev(amount) as stddev
from mytable
group by country, year
Create a view containing the customer id plus the average and stddev values, to make monitoring easier:
create view profile(cust_id, avg_amt, sd_amt) as
select cust_id, avg(charge_amt), stddev(charge_amt) from trans
where date between '2002-01-01' and '2002-03-31'
group by cust_id
Create a trigger: when a row is inserted, if the charge is more than two standard deviations above that customer's average, also insert it into another table:
Create trigger big_chrg
after insert on trans
referencing new as newrow for each row mode db2sql
when (newrow.charge_amt > (select avg_amt + 2.0 * sd_amt from profile where profile.cust_id = newrow.cust_id))
insert into big_charges (cust_id,charge_amt)
values(newrow.cust_id, newrow.charge_amt)
Equi-width histogram:
with dt as (
select
t.transid,
sum(amount) as trans_amt,
case
when sum(amount)/3000 < 0 then 0
when sum(amount)/3000 >19 then 19
else int(sum(amount)/3000)
end as bucket
from trans t, transitem ti
where t.transid=ti.transid
group by t.transid
)
select bucket,count(bucket) as height, (bucket+1)*3000 as max_amt
from dt
group by bucket;
Equi-Height histogram:
with dt as (
select
t.transid,
sum(amount) as trans_amt,
rownumber( ) over(order by sum(amount))*10/(select count(distinct transid)+1 from stars.transitem) as bucket
from stars.trans t, stars.transitem ti
where t.transid=ti.transid
group by t.transid
)
select bucket,count(bucket) as b_count, max(trans_amt) as part_value
from dt
group by bucket;
Note:
There are 3 ranking functions: rank(), denserank(), rownumber().
- row_number() & rownumber are synonyms. Also dense_rank & denserank are synonyms.
All 3 functions rank rows, that is, they assign numbers (1, 2, 3, etc.). If all values are different, all 3 functions produce the same result. If there are duplicates, then:
- rank() - gives them the same number and skips the following number(s)
- denserank() - gives them the same number and does not skip the following number(s)
- rownumber() - gives every row a different number and never skips. This is the only ranking function that does not require an ordering.
select empnum, dept, salary,
rank() over(partition by dept
order by salary desc nulls last) as rank,
dense_rank() over (partition by dept order by salary desc nulls last)as denserank,
row_number() over (partition by dept order by salary desc nulls last)as rownumber
from emptab;
EMPNUM  DEPT  SALARY  RANK  DENSERANK  ROWNUMBER
------  ----  ------  ----  ---------  ---------
     6     1   78000     1          1          1
     2     1   75000     2          2          2
     7     1   75000     2          2          3
    11     1   53000     4          3          4
     5     1   52000     5          4          5
     1     1   50000     6          5          6
------------------------------------------------
     9     2   51000     1          1          1
     4     2       -     2          2          2
over() - to specify sets. This includes partitioning and ordering inside sets (see many examples below).
partition by
order by ( asc , desc ,nulls last )
rows
between ... and ...
N preceding & N following
unbounded preceding & unbounded following (to include the entire preceding/following part of the partition)
current row
range between unbounded preceding and unbounded following
range between current row and unbounded following
Example: Smoothed Time Series (over 1 week : 3 preceding days and 3 following days):
select
date, symbol, close_price,
avg(close_price) over(order by date rows between 3 preceding and 3 following) as smooth_cp
from stocktab
where symbol = 'IBM' and date between '1999-08-01' and '1999-09-01';
Attention: adding an ordering to the set makes the calculations "cumulative": they are computed on the current row and all rows in the set that precede it with respect to the ordering.
Scalar-aggregate functions (version 7 and up) perform scalar calculations on values from multiple rows within a set to compute the result:
select dept, salary,
sum(salary) over(partition by dept) as deptsum,
avg(salary) over(partition by dept) as avgsal,
count(*) over(partition by dept) as deptcount,
max(salary) over(partition by dept) as maxsal
from emptab;
DEPT  SALARY  DEPTSUM  AVGSAL  DEPTCOUNT  MAXSAL
----  ------  -------  ------  ---------  ------
   1   50000   383000   63833          6   78000
   1   75000   383000   63833          6   78000
   1   52000   383000   63833          6   78000
   1   78000   383000   63833          6   78000
   1   75000   383000   63833          6   78000
   1   53000   383000   63833          6   78000
   2       -    51000   51000          2   51000
   2   51000    51000   51000          2   51000
   3   79000   209000   69666          3   79000
   3   55000   209000   69666          3   79000
   3   75000   209000   69666          3   79000
   -       -    84000   84000          2   84000
   -   84000    84000   84000          2   84000
Note: the query above contains no GROUP BY clause. Instead, the OVER clause is used to partition the data so that the sum function is
computed over rows in the same department, and the sum of all the salaries in each department is returned for each row within the department.
Adding an ordering to the set makes the calculations cumulative over the set:
select
date, sales,
sum(sales) over(order by date) as cume_sum,
count(*) over(order by date) as setcount
from sales
where year(date) = 2000;
DATE SALES CUME_SUM SETCOUNT
---------- ------------ ------------ ---------
01/01/2000 968871.12 968871.12 1
02/01/2000 80050.05 1048921.17 2
03/01/2000 757866.14 1806787.31 3
04/01/2000 58748.13 1865535.44 4
05/01/2000 40711.69 1906247.13 5
06/01/2000 241187.78 2147434.91 6
07/01/2000 954924.16 3102359.07 7
08/01/2000 502822.96 3605182.03 8
09/01/2000 97201.45 3702383.48 9
10/01/2000 853999.45 4556382.93 10
11/01/2000 358775.59 4915158.52 11
12/01/2000 437513.35 5352671.87 12
Example for several years:
select date, sales,
sum(sales) over(partition by year(date)
order by month(date)) as cume_sum
from sales
where year(date) >= 2000;
Note: the order of evaluation of a query is as follows:
1. FROM clause
2. WHERE clause
3. GROUP BY clause
4. HAVING clause
5. SELECT list (over ...)
Example taking advantage of this order:
select
year(date) as year,
sum(sales) as sum,
sum(sum(sales)) over(order by year(date)) as cume_sum
from sales
where year(date) >= 1995
group by year(date);
Correlation: ( 1 (or -1) - perfect positive (or negative) relationship ):
select country, state, correlation(annual_purchases, income) as correlation
from mytab
group by country, state
having abs(correlation(annual_purchases, income)) > 0.1;
Note: similar function - covariance()
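For example, a sketch of a parallel query using covariance(), assuming the same mytab columns as above:
select country, state, covariance(annual_purchases, income) as covariance
from mytab
group by country, state;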
select a.custid as custid1, b.custid as custid2, corr(a.amount, b.amount) as corr
from mytab a, mytab b
where a.prodid=b.prodid and a.custid < b.custid
group by a.custid, b.custid
having corr(a.amount, b.amount) >=0.5 and count(*) > 100
order by corr desc;
======
with
dt (prod, year, sales0, sales1,sales2) as (
select
prod, year, total_sales,
max(total_sales) over(partition by prod order by year rows between 1 preceding and 1 preceding),
max(total_sales) over(partition by prod order by year rows between 2 preceding and 2 preceding)
from mytab
)
select
prod,
corr(sales0,sales1)*100 as "corr1(%)",
corr(sales0,sales2)*100 as "corr2(%)"
from dt
group by prod;
Least-Squares Fit (Linear Regression): y = ax + b for a set of non-null (y,x) values:
select
regr_count(sales, ad_budget) as num_cities,
regr_slope(sales, ad_budget) as a,
regr_icpt(sales, ad_budget) as b,
regr_r2(sales, ad_budget) as r_squared
from ad_camp;
Here R-squared is a measure of the quality of the fit (roughly, the square of the corr of x and y)
If you need y = ax*x + b - use regr_slope(y,x*x)
etc.
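For instance, a quadratic model y = a*x*x + b against the same ad_camp table could be sketched like this (same columns as above, just squaring the x values):
select
regr_slope(sales, ad_budget*ad_budget) as a,
regr_icpt(sales, ad_budget*ad_budget) as b
from ad_camp;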
Note: for non-linear curves you can not compute R-squared like this:
select regr_r2(log(hits), log(days)) as r2 from traffic_data;
because it will give you 0.99, instead of 0.95.
A correct way is to use the with ... expression to:
1. calculate the regr_slope and regr_icpt for the model
2. calculate the residuals (differences between data and the model for each point)
3. calculate the r2 like this:
select 1e0 - (sum(error*error)/regr_syy(hits,days)) as r2
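Putting the three steps together, a minimal sketch, assuming the model fitted in log-log space is the power curve hits = exp(b) * days^a and the traffic_data table from above:
with
model(a, b) as (
  select regr_slope(log(hits), log(days)), regr_icpt(log(hits), log(days))
  from traffic_data
),
resid(hits, days, error) as (
  select t.hits, t.days,
         t.hits - exp(m.b) * power(t.days, m.a)   -- residual: data minus the fitted curve
  from traffic_data t, model m
)
select 1e0 - (sum(error*error)/regr_syy(hits, days)) as r2
from resid;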
To find if one or two points strongly influence the model - we can use the HAT matrix.
Here is how to do the HAT Diagonal Computation:
with stats(mx,mx2,sxx) as(
select
regr_avgx(sales,ad_budget),
regr_avgx(sales,ad_budget*ad_budget),
regr_sxx(sales,ad_budget)
from mytab
)
select d.label as city, (s.mx2 - 2*s.mx*d.x + d.x*d.x) / s.sxx as HAT
from xy_data d, stats s
order by HAT desc;
Fit the line in the form y=ax :
We need to compute a as follows: a = (x1*y1 + x2*y2 + ... + xn*yn)/(x1^2 + ... + xn^2)
Note that regr_sxx = (x1-mx)^2 + ... + (xn-mx)^2
(And similarly regr_sxy)
So:
(x1^2 + ... + xn^2) = regr_sxx + n*mx^2
(x1*y1 + x2*y2 +..+xn*yn) = regr_sxy + n*mx*my
Thus we get:
select
regr_count(kwh,hours_run) as num_machines,
(regr_sxy(kwh,hours_run) + regr_count(kwh,hours_run) * regr_avgx(kwh,hours_run) * regr_avgy(kwh,hours_run))
/
(regr_sxx(kwh,hours_run) + regr_count(kwh,hours_run) * regr_avgx(kwh,hours_run) * regr_avgx(kwh,hours_run))
as a
from power_data;
Example: we want to find cities where the ad campaign is particularly effective.
We use the with ... construct and calculate a, b, and sigma as follows:
regr_slope() as a
regr_icpt() as b
sqrt((regr_syy() - (regr_sxy()*regr_sxy()/regr_sxx())) / (regr_count() - 2)) as sigma
where all regr functions have 2 arguments: (sales, ad_budget)
Now we can select ... where sales > a*ad_budget + b + 2e0*sigma
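A minimal sketch of the whole query, assuming an ad_camp table with sales and ad_budget columns (the city column is an assumption for illustration):
with fit(a, b, sigma) as (
  select
    regr_slope(sales, ad_budget),
    regr_icpt(sales, ad_budget),
    sqrt((regr_syy(sales, ad_budget)
          - regr_sxy(sales, ad_budget)*regr_sxy(sales, ad_budget)/regr_sxx(sales, ad_budget))
         / (regr_count(sales, ad_budget) - 2))
  from ad_camp
)
select c.city, c.sales, c.ad_budget
from ad_camp c, fit f
where c.sales > f.a * c.ad_budget + f.b + 2e0 * f.sigma;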
The same regression functions can also be used to compute other statistical data, like F-statistics.
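For instance, a sketch of the F-statistic for the simple regression of sales on ad_budget, using the standard identities SSR = Sxy*Sxy/Sxx and SSE = Syy - SSR with n-2 error degrees of freedom:
select
  (regr_sxy(sales, ad_budget)*regr_sxy(sales, ad_budget)/regr_sxx(sales, ad_budget))
  /
  ((regr_syy(sales, ad_budget)
    - regr_sxy(sales, ad_budget)*regr_sxy(sales, ad_budget)/regr_sxx(sales, ad_budget))
   / (regr_count(sales, ad_budget) - 2)) as f_stat
from ad_camp;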
To compare 2 sets and verify a hypothesis, people have traditionally used the t-test (which presumes a normal distribution),
or a more modern procedure called the Wilcoxon Rank Test, which avoids that restriction.
It is calculated using the rank() function (or the dense_rank() function for dealing with duplicate totals).
Here is how the ranks are calculated:
with
ranked_sales(city,ranks) as (
select city, rank( ) over(order by sales) from feb_sales
)
select sum(ranks) as W from ranked_sales where city = 'B'
Then one needs tables from a statistics book to compute the result.
----- Removing duplicates:
create view aaatemp(rn) as select rownumber() over(partition by col1 || '_' || col2) from mytab;
delete from aaatemp where rn > 1;
drop view aaatemp;
Note: the view doesn't have any columns of the original table. Still, deleting from the view deletes rows from the original table.
----- Finding duplicates without creating a view:
with
mytemp(name, desc_text, rn) as (
select name, desc_text, rownumber() over(partition by desc_text) as rn
from s_org_ext where cust_stat_cd !='Deleted'
)
select * from mytemp where rn > 1
----- Delete 100 rows at a time by hand:
create view aaatemp(rn) as select rownumber() over() as rn from mytab where ....;
delete from aaatemp where rn <= 100;
----- Very-very slow way to go 10 at a time:
select a.row_id from yourtable a
where 10 > (select count(*) from yourtable b where a.row_id < b.row_id)
order by row_id desc
----- Paging through table:
select myname
from (select myname, rownumber() over(order by myname) as rn from mytable) as tr
where rn between 10000 and 10020
----- rank:
select name, dept, salary, rank() over(partition by dept order by salary desc) as salary_rank
from staff
order by dept, salary_rank
----- moving average:
select sales_date, avg(sales) over(order by sales_date rows between 1 preceding and 1 following) as smoothed_sales
from sales
where sales_date > '03/28/1996' and sales_person = 'LEE' and region = 'some-region'
----- grouping sets:
select year(sales_date) as year, region, sales_person, count(*) as tot_sales
from sales
group by grouping sets (year(sales_date), region, sales_person, ( ))
order by year(sales_date), region, sales_person