hive脚本笔记

(注:此处原为 1–332 的连续行号残留,系文档提取产生的噪声,与正文内容无关,已清理。)

在create table 时,添加STORED AS ORC设置
在insert 前,添加set hive.default.fileformat=Orc;设置

# Build uid_mid_cat_once: per-uid count of distinct level-1 interest categories
# (first id of new_to_ids) among materials the user clicked via the four
# interest push channels on $date, restricted to gh_user_feature_daily users
# with freq in (4,5) and opened_60='1'.
# Fix: the original never closed/aliased the outer derived table and omitted
# the GROUP BY required by count(distinct cat) — added ")t group by uid".
# NOTE(review): partition predicates mix dt=${date} (unquoted) and dt='$date'
# (quoted); presumably both resolve to the same yyyymmdd string — confirm the
# dt partition column type before relying on the unquoted comparison.
hive -e "
drop table if exists uid_mid_cat_once;
create table uid_mid_cat_once as
select uid, count(distinct(cat)) as cat_num from
(select clk.uid, clk.mid, split(t2.category,',')[0] as cat from
(select dt,mid,uid,push_channel_code
from multi_push_is_click where dt=${date} and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))clk
join
(select uid from gh_user_feature_daily where usertype='uid' and dt='$date' and freq in ('4','5') and opened_60='1')gh
on clk.uid=gh.uid
join
(select mid,category from
(select mid, new_to_ids from qiuyu_interest_material_static_info where dt='$date')b
lateral view explode(split(new_to_ids,',')) table_view
as category group by mid,category)t2
on clk.mid=t2.mid
)t group by uid
"

# Sanity counts over uid_mid_cat on a deterministic user sample: keeps uids
# whose 2nd-from-last digit is '7' and whose 6th-from-last digit is '3' or '4'
# (a cheap fixed pseudo-random bucket of users); writes results to num_cmp.
hive -e"select count(distinct(uid)) as uid_num, count(distinct(mid)) as mid_num, count(distinct(cat)) as cat_num from uid_mid_cat where length(uid)>=8 and (substring(uid,-2,1)='7')
and (substring(uid,-6,1)='3' or substring(uid,-6,1)='4')" > num_cmp

#统计4组月活用户下发数
function join_tid_cat_luicode(){
# Count distinct mids sent to a sampled slice of monthly-active users.
# Fixes vs. the original notes: removed the "1." / "2." / "3." step-number
# prefixes (invalid shell), fixed the "inpatn" typo to "inpath", fixed the
# invalid DDL "(uid, string)" to "(uid string)", unified the table name
# (DDL created "zhipengtest" but the load/query used "zhipeng_yuehuo_user"),
# and moved LOAD DATA inside hive -e where it must run.
# Step 1: extract monthly-active ('"mfreq":"h"') uids from the HDFS dump.
hadoop fs -cat /user_ext/weibo_bigdata_push/xiaoye5/UserInfoStoreTask/dt=20211229/part-00999 | grep '"mfreq":"h"' | awk '{print $1}' > tmp.txt
# Step 2: (re)create the uid table.
hive -e "create table if not exists zhipeng_yuehuo_user (uid string)"
# Step 3: load the extracted uids.
hive -e "load data local inpath 'tmp.txt' overwrite into table zhipeng_yuehuo_user"
# Step 4: join the sampled uid bucket against the click log and count sends.
hive -e "
select count(distinct(clk.mid)) as send from
(select * from zhipeng_yuehuo_user where length(uid)>=8 and (substring(uid,-2,1)='7')
and (substring(uid,-6,1)='7' or substring(uid,-6,1)='8'))zhipeng
join
(select dt,label,uid,push_channel_code,mid
from multi_push_is_click where dt='20211229' and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))clk
on zhipeng.uid=clk.uid" > tmp.txt
}

#统计用户纬度热搜下发三级标签和兴趣下发的重复程度
function join_tid_cat_luicode(){
# Per-uid overlap between hot-search pushes (channel 10001179, categories from
# tmp_tid_cat_luicode) and interest pushes (uid_mid_cat): rate = mids whose
# category matches an interest category the uid already received / all mids.
# NOTE(review): this redefines the join_tid_cat_luicode declared earlier in
# this file — in bash the definition active at call time wins; rename one of
# them if both are ever needed.
# NOTE(review): COALESCE(CAST(all_num AS DOUBLE), 0) makes a zero divisor
# possible; in Hive x/0 yields NULL rather than an error — confirm intended.
hive -e "
drop table if exists uid_mid_cat_rate_once;
create table uid_mid_cat_rate_once as
select tmp1.uid, tmp2.dupli_num, tmp1.all_num, ROUND(tmp2.dupli_num/COALESCE(CAST(tmp1.all_num AS DOUBLE), 0), 2) as rate from
(select uid, count(distinct(mid)) as all_num from uid_mid_cat where uid>0 group by uid)tmp1
join(
select clk.uid as uid, count(distinct(t1.mid)) as dupli_num from
(select mid, cat from
(select mid, category_ids from tmp_tid_cat_luicode where dt=${date})tmp
lateral view explode(split(category_ids,',')) table_view as cat group by mid, cat)t1
join
(select dt,mid,uid,push_channel_code from multi_push_is_click where dt=${date} and push_channel_code in ('10001179'))clk
on t1.mid=clk.mid
join
(select uid,mid,cat from
uid_mid_cat
)t2
on clk.uid=t2.uid and t1.cat=t2.cat
group by clk.uid
)tmp2
on tmp1.uid=tmp2.uid
"
}


function join(){
# Skeleton / note-to-self: the general shape of the
# "send table JOIN lateral-view-exploded category table JOIN click table"
# queries used throughout this file. The embedded SQL is intentionally
# incomplete pseudo-code — invoking this function would fail in hive.
hive -e "
select xxx from
(select xx)tmp1
join
(select from
(select mid, cat_ids)tmp lateral view explode table_view as cat
)t1
join
(select)clk
on
"
}

#统计兴趣uid mid 下发mid三级标签
function create_uid_mid_cat(){
# Emit (uid, mid, cat) rows: for each click on the four interest push channels
# on $date, attach the material's exploded new_to_ids category (first token of
# each exploded value). Same core query as the uid_mid_cat_once build at the
# top of this file, but prints rows instead of creating a table.
hive -e "
select clk.uid, clk.mid, split(t2.category,',')[0] as cat from
(select dt,mid,uid,push_channel_code
from multi_push_is_click where dt=${date} and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))clk
join
(select mid,category from
(select mid, new_to_ids from qiuyu_interest_material_static_info where dt='$date')b
lateral view explode(split(new_to_ids,',')) table_view
as category group by mid,category)t2
on clk.mid=t2.mid
"
}

function interest_push_ids() {
# For each (mid, uid) clicked through the four interest channels on $date,
# emit the material's interest id list (new_to_ids); one row per distinct
# (mid, uid, ids) triple, written to interest_push_ids.
# Fix: the original selected mid/t.uid/s.ids while grouping only by t.uid —
# invalid HiveQL (non-aggregated columns missing from GROUP BY) with a bare,
# ambiguous "mid"; deduplicate with SELECT DISTINCT instead.
hive -e"
select distinct t.mid as mid, t.uid as uid, s.ids as ids from
(select mid, new_to_ids as ids from qiuyu_interest_material_static_info where dt=${date})s
join
(select dt,mid,uid,label,push_channel_code
from multi_push_is_click where dt=${date} and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))t
on s.mid=t.mid" > mining/data/highClick_blogger/interest_push_ids
}

function join_data(){
# Enrich the uid list in table $tbN with verified-account fields
# (ods_user_v_info_ex), nickname (ods_user_base_info), last original-post
# time, and 7/30-day original post counts, writing the result to table $tbR.
# NOTE(review): relies on caller-provided $tbR/$tbN/$dt; the two hard-coded
# dt='20211129' partitions look like leftovers — presumably they should
# track $dt as the other joins do. Confirm before reuse.
hive -e"
drop table if exists $tbR;
create table $tbR as
select a.uid,
c.nick, b.vuser_type_id,b.real_name, b.cert_info, b.apply_reason,b.vuser_type,b.group_name1, b.group_name2, b.class_name1, b.class_name2, b.ex_group_id, b.ex_adminid, b.distributor, b.cert_type,d.last_time,f.original_num_30_days,f.original_num_7_days from
(select * from $tbN)a
left join
(select * from ods_user_v_info_ex where dt='$dt' )b
on a.uid=b.uid
left join
(select * from ods_user_base_info where dt='$dt' )c
on a.uid=c.uid
left join
(select * from hzt_push_mbloguid_last_mid_createtime_origin_30days where dt='20211129')d
on a.uid=d.mblog_uid
left join
(select * from hzt_push_mbloguid_original_num_7and30days where dt='20211129')f
on a.uid=f.mblog_uid;
"
}

# Date window: $date = yesterday; [left_date, right_date] = the 7 days ending
# at $date.
# NOTE(review): GNU date parses "7 days ago $date" — confirm this relative
# phrase ordering yields $date minus 7 days on the target system; the more
# common form is `date -d "$date 7 days ago"`.
date=`date -d "1 days ago" +%Y%m%d`
left_date=`date -d "7 days ago $date" +%Y%m%d`
right_date=`date -d "0 days ago $date" +%Y%m%d`
echo $left_date
echo $right_date
function highReduBlogger() {
# Bloggers with >=100k filtered fans whose 7-day "redu" (heat) exceeds 5,
# sorted by heat descending, written to redu_top_blogger_7days_all.
# NOTE(review): "/" binds tighter than "+", so redu computes
# sum(traned_cnt) + (sum(cmted_cnt)/sum(orig_cnt)). If the intent was
# (reposts + comments) / originals, parentheses are missing — confirm.
hive -e"
select fans.uid, redu from
(SELECT uid,filtered_fans_num FROM mds_user_relation_sum WHERE dt='${right_date}' AND filtered_fans_num >= 100000)fans
join
(select uid, redu from
( select uid, sum(traned_cnt)+sum(cmted_cnt)/sum(orig_cnt) as redu from mds_bhv_tblog_day where dt>${left_date} and dt<=${right_date}
and app_class_code='10000' and is_weibo='10000' group by uid)tmp where redu>5
)mds
on fans.uid=mds.uid
sort by redu desc" > mining/data/highClick_blogger/redu_top_blogger_7days_all
}
highReduBlogger

# Drive big_icon once per day from 5 days ago up to (but not including) today.
# "echo $date+" writes the date plus a literal '+' as a separator line.
# "-1 days ago" means one day in the future, i.e. it advances $date by one day
# each iteration until it equals $end_dt.
# NOTE(review): big_icon is defined further down this file — it must be
# declared before this loop actually executes, or the call fails.
# NOTE(review): `let` does arithmetic assignment; it works here only because
# yyyymmdd is a plain decimal — a plain date=`...` assignment would be clearer.
date=`date -d "5 days ago" +%Y%m%d`
end_dt=`date +%Y%m%d`
while [ "$date" != "$end_dt" ]
do
echo $date+ >> dateIconClick
big_icon
let date=`date -d "-1 days ago $date" +%Y%m%d`
done


# 备注:行为日志表 mds_bhv_tblog_day(上方 highReduBlogger 的数据来源)

# $date = as-of day (unchanged), $date3 = 3 days before $date.
date=`date -d "0 days ago $date" +%Y%m%d`
date3=`date -d "3 days ago $date" +%Y%m%d`
function highFansBlogger() {
# Send/click counts per blogger for relation-push channels (10000323,
# 10000752): bloggers with >=100k filtered fans, matched via their posts in
# the last 3 days, right-joined to the click log so unmatched clicks are kept
# (their bid is NULL). Output sorted by clicks descending.
# NOTE(review): ${dt_fans} is not assigned anywhere in this file — it must be
# exported by the caller, otherwise the fans partition predicate is empty.
hive -e"
select count(clk.label) as send, sum(if(label='1',1,0)) as click, blogger.uid as bid from
(select fans.uid, mid from
(SELECT uid,filtered_fans_num FROM mds_user_relation_sum WHERE dt='${dt_fans}' AND filtered_fans_num >= 100000)fans
right join
( select mid,uid,dt from ods_tblog_content where dt>${date3} and dt<=${date})ods
on fans.uid=ods.uid
)blogger
right join
(select dt,mid,uid,label,push_channel_code
from multi_push_is_click where dt=${date} and push_channel_code in ('10000323','10000752'))clk
on blogger.mid=clk.mid
group by blogger.uid sort by click desc" > mining/data/highClick_blogger/high_fans_blogger_7days_800
}

function highClickBlogger800() {
# Top-${limit} bloggers by click count on the four interest push channels:
# right-join material static info (mid -> author bid) to the click log with
# dt>${date}, group by blogger, sort by clicks descending.
# NOTE(review): the output filename says "120days" but the window is simply
# dt>${date} — whether that spans 120 days depends entirely on what $date was
# last set to above; ${limit} must also be supplied by the caller. Confirm.
hive -e"
select count(t.label) as send, sum(if(label='1',1,0)) as click, s.bid as bid from
(select mid, uid as bid from qiuyu_interest_material_static_info where dt=${date})s
right join
(select dt,mid,uid,label,push_channel_code
from multi_push_is_click where dt>${date} and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))t
on s.mid=t.mid
group by s.bid sort by click desc limit ${limit}" > mining/data/highClick_blogger/click_top_blogger_120days_800
}
highClickBlogger800

#hive -e "select count(mid), bid from zhipeng_b group by bid" > 813log
# Fixed analysis date for the icon experiments below.
date='20210820'
function big_icon() {
# Daily stats for big-picture pushes: dedupe (uid, mid) send records whose
# ext carries 'bigPic' (mid parsed from the lfid field), join against the
# click log for the four interest channels, then count send/click PV and UV.
# Appends one result row to dateIconClick (driven by the date loop above).
hive -e "select count(b.label) as send_pv, sum(if(label='1',1,0)) as click_pv,
count(distinct(b.uid)) as send_uv, count(distinct case when b.label='1' then b.uid else NULL end) as click_uv,
count(distinct(b.uid)) as all_icon_user, count(distinct(b.mid)) as all_icon_mid
from
(select uid,mid from
(select uid,split(lfid,'_')[1] as mid from ods_wls_push_mps_send where dt='$date' and ext like '%bigPic%')tmp
group by uid,mid)a
join
(select mid, uid,label from multi_push_is_click where dt='$date' and push_channel_code in ('10000216bjc','10001003','10001012','10001167'))b
on a.uid=b.uid and a.mid=b.mid" >> dateIconClick
}

function big_iconMI() {
# Variant of big_icon restricted to Xiaomi-channel deliveries: clicks are
# first narrowed to (uid, mid) pairs present in sends whose ext contains
# 'xiaomi', then matched (on mid only) against the distinct set of big-picture
# mids. Appends to dateIconClick.
hive -e "select count(t.label) as send_pv, sum(if(label='1',1,0)) as click_pv,
count(distinct(t.uid)) as send_uv, count(distinct case when t.label='1' then t.uid else NULL end) as click_uv,
count(distinct(t.uid)) as all_icon_user, count(distinct(t.mid)) as all_icon_mid
from
(select distinct split(lfid,'_')[1] as mid from ods_wls_push_mps_send where dt='$date' and ext like '%bigPic%')a
join(
select b.mid as mid,b.uid as uid,b.label as label
from(
select mid, uid,label from multi_push_is_click where dt='$date' and push_channel_code in ('10000216bjc','10001003','10001012','10001167')
)b

join(
select distinct uid,split(lfid,'_')[1] as mid from ods_wls_push_mps_send where dt='$date' and ext like '%xiaomi%'
)m
on b.mid=m.mid and b.uid=m.uid
)t
on a.mid=t.mid" >> dateIconClick
}

function big_Nicon() {
# Same stats as big_icon but for NON-big-picture pushes
# (ext not like '%bigPic%'), hard-coded to dt='20210820'; overwrites dataClick.
# Fix: the original inner query read "(select uid,mid (select ..." — the FROM
# keyword before the derived table was missing, so the statement could not
# parse; added "from".
hive -e "select count(b.label) as send_pv, sum(if(label='1',1,0)) as click_pv,
count(distinct(b.uid)) as send_uv, count(distinct case when b.label='1' then b.uid else NULL end) as click_uv,
count(distinct(b.uid)) as all_icon_user, count(distinct(b.mid)) as all_icon_mid
from
(select uid,mid from
(select uid,split(lfid,'_')[1] as mid from ods_wls_push_mps_send where dt='20210820' and ext not like '%bigPic%')tmp
group by mid,uid)a
join
(select mid, uid,label from multi_push_is_click where dt='20210820' and push_channel_code in ('10000216bjc','10001003','10001012','10001167')
)b
on a.uid=b.uid and a.mid=b.mid" > dataClick
}

function insert_interest_data(){
# Per-(category, is-new-material, channel) send/click PV for $dt: LEFT JOIN
# the interest-channel click log to exploded level-1 categories; ruku_date is
# '1' when the material entered the library on $dt, else '0'. Output goes to
# new_category_send/$dt.
# NOTE(review): a second function with this same name is defined further down
# (UV version) — in bash, whichever definition was read most recently wins at
# call time; the call just below uses THIS (PV) version.
hive -e"
select t2.category,t2.ruku_date,count(t.label) as send_pv,sum(if(label='1',1,0)) as click_pv,count(distinct t2.mid) as mid_num, t.push_channel_code
from
(select dt,mid,uid,label,push_channel_code
from multi_push_is_click where dt='$dt' and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))t
left join
(select mid,ruku_date,category from
(select mid,if(from_unixtime(int(ruku_time),'yyyyMMdd')='$dt','1','0') as ruku_date,new_class_level1 from qiuyu_interest_material_static_info where dt='$dt')b
lateral view explode(split(new_class_level1,',')) table_view
as category group by mid,category,ruku_date)t2
on t.mid=t2.mid
group by t.dt,t2.category,t2.ruku_date, t.push_channel_code
"> new_category_send/$dt
}
insert_interest_data



function highActionCodeBlogger(){
# Materialize per-(uid, mid) untimely-click detail joined with action codes
# and the material's author (bid) into
# tmp_push_interest_read_blogger_daily_mengdi7 for $date.
# act_time is the earliest click time per (uid, mid, luicode);
# skew handling and reducer count are set explicitly for the group-bys.
hive -e "
set hive.groupby.skewindata=true;
set mapred.reduce.tasks=200;

drop table if exists tmp_push_interest_read_blogger_daily_mengdi7;
create table tmp_push_interest_read_blogger_daily_mengdi7 row format delimited fields terminated by '\t' as
select t.uid as uid, t.mid as mid, s.bid as bid, t.act_time as act_time, t.act_codes as act_codes, t.luicode as luicode
from (
select mid, uid as bid from qiuyu_interest_material_static_info where dt=${date}
) s
join (
select a.uid as uid, a.mid as mid, a.act_time as act_time, b.act_codes as act_codes, a.luicode as luicode
from (
select uid, mid, luicode, min(act_time) as act_time from push_interest_untimely_click_datas_mengdi7 where dt=${date}
group by uid, mid, luicode
) a
join (
select uid, mid, act_codes from push_interest_untimely_only_action_datas_mengdi7 where dt=${date}
) b
on a.uid=b.uid and a.mid=b.mid
) t
on s.mid=t.mid
;
"
}




#function insert_interest_data(){
#hive -e"
#select t.category,t.ruku_date,sum(t.send) as send_num,sum(t.click) as click_num,count(mid) as mid_num from
#(select t1.category,t1.ruku_date,t1.send,t2.click,t1.mid from
#(select a1.mid,a1.send,a2.category,a2.ruku_date from
# (select a.mid,count(distinct a.uid)[多少个uid就是多少send_uv as] send from
# (select split(lfid,'_')[1] as mid,uid from ods_wls_push_mps_send
# where dt='$dt' and length(uid)<=13 and ((substring(luicode,0,8)='10000216' and substring(sid,0,11)='10000216bjc') or (substring(luicode,0,8) in ('10001003','10001167','10001012'))))a group by a.mid
# )a1
# left join
# (select mid,ruku_date,category from
# (select mid,if(from_unixtime(int(ruku_time),'yyyyMMdd')='$dt',1,0) as ruku_date,new_class_level1 from qiuyu_interest_material_static_info where dt='$dt')b
# lateral view explode(split(new_class_level1,',')) table_view
# as category group by mid,category,ruku_date)a2
# on a1.mid=a2.mid
# )t1
#left join
#(select a2.mid,count(distinct a2.uid) as click from (select split(previous_id,'_')[1] as mid,uid from mds_wls_encode_bhv where dt='$dt'
#and (action in ('7','26','97','118','159','169','191','320','337','1246','496','3156','2481','2507','317','1332','1555','104'))
#and length(uid)<=13
#and ((previous_uicode='10000216' and substring(previous_id,1,1)='1') or
#previous_uicode in ('10001003','10001167','10001012')))a2 group by a2.mid)t2
#on t1.mid=t2.mid)t
#group by t.category,t.ruku_date;
#"> category_send/$dt
#}

function insert_interest_data(){
# UV variant of insert_interest_data: per-(category, is-new-material, channel)
# send/click UV (distinct users) instead of PV; otherwise the same LEFT JOIN
# against exploded level-1 categories for $dt. Output: new_category_send/$dt.
# NOTE(review): this REDEFINES the PV version declared earlier in the file;
# the call below uses this UV version, while the earlier call used the PV one.
# Give the two functions distinct names if both are needed in one run.
hive -e"
select t2.category,t2.ruku_date,count(distinct t.uid) as send_uv,
count(distinct case when t.label='1' then t.uid else NULL end) as click_uv, count(distinct t2.mid) as mid_num, t.push_channel_code
from
(select dt,mid,uid,label,push_channel_code
from multi_push_is_click where dt='$dt' and push_channel_code in ('10000216bjc','10001003','10001167','10001012'))t
left join
(select mid,ruku_date,category from
(select mid,if(from_unixtime(int(ruku_time),'yyyyMMdd')='$dt','1','0') as ruku_date,new_class_level1 from qiuyu_interest_material_static_info where dt='$dt')b
lateral view explode(split(new_class_level1,',')) table_view
as category group by mid,category,ruku_date)t2
on t.mid=t2.mid
group by t.dt,t2.category,t2.ruku_date, t.push_channel_code
"> new_category_send/$dt
}
insert_interest_data

同步最近兴趣自采物料相关工作:

1、兴趣120天内高点击量top400博主自采供稿

​ 挖掘方式:取120天内的点击行为日志表multi_push_is_click的兴趣物料点击情况,join静态信息表中的发博博主,group by博主列倒排点击数,得到120天内高点击量top400博主

2、关系**10w粉**7天内点击>5所有博主自采供稿

​ 挖掘方式:取7天内的点击行为日志表multi_push_is_click的关系物料点击情况,join关系博主信息表中的发博博主,group by博主列倒排点击数,得到7天内点击数>5的全部10w粉以上博主

3、兴趣120天内高点击量top400博主自采供稿

​ 挖掘方式:取120天内的点击行为日志表multi_push_is_click的兴趣物料点击情况,join静态信息表中的

​ 发博博主,group by博主列倒排点击数,得到120天内高点击量top400博主