hive 抽样统计
- - SQL - 编程语言 - ITeye博客本文转自http://www.taobaotesting.com/blogs/2468,原文分层抽样的逻辑不是很清楚,按照自己的想法重新实现个. 用hive实现了随机抽样中简单随机、系统和分层抽样的方式,记得抽样的概念还是初中数据接触的. 其实很多时候不需要理论,想也是可以想到的,不过还是总结一下.
drop table songpo_test; create table if not exists songpo_test ( refund_id string, user_id string, cat_id string, cat2_id string, org_id string, gmt_create string ) partitioned by(pt string) row format delimited fields terminated by ',' lines terminated by '\n' STORED AS SEQUENCEFILE;
sql: select * from( select user_id,flag from (select user_id,'1' as flag from songpo_test) x distribute by user_id sort by user_id,flag desc )a where row_number(user_id)<=100;
sql: select * from( select refund_id,user_id,mod_num,rank_num from (select refund_id,user_id,cast(10+rand()*100 as double) rank_num,user_id%5 as mod_num from songpo_test) distribute by mod_num sort by mod_num,rank_num desc )a where row_number(mod_num)<=20;
drop table test_data_extra_indexs; create table test_data_extra_indexs as select a.cat_id,cat_num,all_num,cat_num/all_num as extra_lv,(cat_num/all_num)*'EXTRA_NUM' as cat_extra_num,c.refund_id,c.user_id,c.org_id from (select cat_id,count(1) as cat_num,'1' as key from songpo_test group by cat_id) a join (select '1' as key,count(1) as all_num from songpo_test) b on a.key=b.key join (select * from songpo_test) c on a.cat_id=c.cat_id; select * from( select refund_id,user_id,cat_id,mod_num,rank_num from select refund_id,user_id,cat_id,cast(10+rand()*100 as double) rank_num,user_id%5 as mod_num,cat_extra_num from( (select refund_id,user_id,cat_id,cast(10+rand()*100 as double) rank_num,user_id%5 as mod_num from test_data_extra_indexs) x ) distribute by mod_num sort by mod_num,rank_num desc )a where row_number(mod_num)<=20;