""" desc : 素材库数据清洗 每日运行一次 """ from model.DataBaseUtils import MysqlUtils db = MysqlUtils() def title(): sql = """select REPLACE(REPLACE(title, CHAR(10), ''), CHAR(13), '') content, cast(sum(cost) as float) consume_amount, sum(click_count) click_times, sum(view_count) view_times, cast(count(*) as decimal(10,2)) use_times, group_concat(distinct book) novels, max(dt) end_date,min(dt) start_date from dw_image_cost_day where title!='' and title is not null GROUP BY REPLACE(REPLACE(title, CHAR(10), ''), CHAR(13), '') """ df = db.dm.pd_data_sql(sql) df["data_type"] = 'all' df['type'] = '1' df['create_by'] = '0' key = ["content", "type"] tag = ["view_times", "click_times", "novels", "start_date", "end_date", "create_by", 'data_type', 'consume_amount'] table = "t_ads_content" db.zx_ads.dfsave2mysql(df, table, key, tag) def description(): sql = """select REPLACE(REPLACE(description, CHAR(10), ''), CHAR(13), '') content, sum(cost) consume_amount, sum(click_count) click_times, sum(view_count) view_times, cast(count(*) as decimal(10,2)) use_times, group_concat(distinct book) novels, max(dt) end_date,min(dt) start_date from dw_image_cost_day where description!='' and description is not null GROUP BY REPLACE(REPLACE(description, CHAR(10), ''), CHAR(13), '') """ df = db.dm.pd_data_sql(sql) df["data_type"] = 'all' df['type'] = '2' df['create_by'] = '0' key = ["content", "type"] tag = ['use_times', "view_times", "click_times", "novels", "start_date", "end_date", "create_by", 'data_type', 'consume_amount'] table = "t_ads_content" db.zx_ads.dfsave2mysql(df, table, key, tag) def image(): # TODO:视频,图片新添加的列没有归并到media,idea列里面,在dw_image_cost_day也没有进行对应归并 sql = """select signature,sum(consume_amount) consume_amount, sum(click_times) click_times, sum(view_times) view_times, sum(use_times) use_times, group_concat(distinct novels) novels , max(end_date) end_date, min(start_date) start_date, min(content) content, min(type) type, if(locate(',',signature)>0,0,1) single_img, min(width ) width , min(height ) height , min(media_size) media_size , min(media_format) media_format, min(video_length) video_length, min(video_bit_rate) video_bit_rate, 0 max_media_size, min(width)/if(min(height),min(height),1) aspect_ratio from (select replace(signature,' ,','') as signature , sum(cost) consume_amount, sum(click_count) click_times, sum(use_times) use_times, sum(view_count) view_times, group_concat(distinct book) novels , max(dt) end_date, min(dt) start_date, replace (min(preview_url),' ,','') as content, if(is_video=1,2,1) type, if(locate(',',signature)>0,0,1) single_img, if(min(width)>0,min(width),0) width, if(min(height)>0,min(height),0) height, min(replace(if(left (size ,2)='0,',substring(size ,3),size) ,',0','')) media_size, min(replace(format ,' ,','')) media_format, if(min(video_length)>0,min(video_length),0) video_length, if(min(video_bit_rate)>0,min(video_bit_rate),0) video_bit_rate from dw_image_cost_day where signature is not null and signature !='' and length (replace (replace (signature,',',''),' ',''))>0 GROUP BY signature,is_video) as foo group by signature """ # df = db.dm.getData_pd(sql) df = db.dm.pd_data_sql(sql) # print(df) # 进行数据转换-----添加max_media_size for i in range(len(df['media_size'])): if not df['media_size'][i]: continue size_list = df['media_size'][i].split(',') max_size = 0 for size_data in size_list: if size_data != 'None': if float(size_data) > float(max_size): max_size = str(size_data) df['max_media_size'][i] = max_size df['create_by'] = '0' df["data_type"] = 'all' key = ["signature"] tag = ['media_size', 'media_format', 'video_length', 'video_bit_rate', 'max_media_size', 'use_times', "view_times", "click_times", "novels", "start_date", "end_date", "create_by", "single_img", "content", 'consume_amount', 'type', 'width', 'height', 'aspect_ratio'] table = "t_ads_media" db.zx_ads.dfsave2mysql(df, table, key, tag) def adcreative(): sql = """select signature,title,article, sum(click_times) click_times, sum(view_times) view_times, sum(use_times) use_times, sum(consume_amount) consume_amount, group_concat(distinct novels) novels, min(start_date) start_date, max(end_date) end_date, min(media) media, min(channel) channel , min(type) type, if(locate(',',signature)>0,0,1) single_img, min(width) width, min(height) height, min(media_size) media_size , min(media_format) media_format, min(video_length) video_length, min(video_bit_rate) video_bit_rate, 0 max_media_size, min(width)/if(min(height),min(height),1) aspect_ratio from (select replace(signature ,' ,','') as signature,title,description article, sum(click_count) click_times, sum(view_count) view_times, sum(use_times) use_times, sum(cost) consume_amount, group_concat(distinct book) novels, min(dt) start_date,max(dt) end_date, min(replace(preview_url ,' ,','')) media, if(min(width)>0,min(width),0) width, if(min(height)>0,min(height),0) height, min(replace(if(left (size ,2)='0,',substring(size ,3),size) ,',0','')) media_size , min(replace(format ,' ,','')) media_format, if(min(video_length)>0,min(video_length),0) video_length, if(min(video_bit_rate)>0,min(video_bit_rate),0) video_bit_rate, type channel, if(is_video=1,2,1) type, if(locate(',',signature)>0,0,1) single_img from dw_image_cost_day where signature is not null and signature!='' GROUP BY signature,title,description,type,is_video) as foo group by signature ,title ,article """ # df = db.dm.getData_pd(sql) df = db.dm.pd_data_sql(sql) # 进行数据转换-----添加max_media_size for i in range(len(df['media_size'])): if not df['media_size'][i]: continue size_list = df['media_size'][i].split(',') max_size = 0 for size_data in size_list: if size_data != 'None': if float(size_data) > float(max_size): max_size = str(size_data) df['max_media_size'][i] = max_size key = ["signature", 'title', 'article'] tag = ['media_size', 'media_format', 'video_length', 'video_bit_rate', 'max_media_size', 'use_times', "view_times", "click_times", "novels", "start_date", "end_date", "type", "channel", 'consume_amount', 'single_img', 'media', 'width', 'height', 'aspect_ratio'] table = "t_ads_idea" db.zx_ads.dfsave2mysql(df, table, key, tag) # TODO:线上,线下视频进行归并 # 同一个signature选择同一个 def run(): title() description() image() adcreative() if __name__ == '__main__': run() # title() # description() # image() # adcreative()