# MaterialDataClean.py

  1. """
  2. desc : 素材库数据清洗
  3. 每日运行一次
  4. """
  5. from model.DataBaseUtils import MysqlUtils
  6. db = MysqlUtils()
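
# Assumed interface of the helpers used below (inferred from how they are called here,
# not verified against the MysqlUtils implementation):
#   db.dm.pd_data_sql(sql)                       -- run a query against the `dm` warehouse
#                                                   and return a pandas DataFrame
#   db.zx_ads.dfsave2mysql(df, table, key, tag)  -- upsert the DataFrame into `table` in the
#                                                   `zx_ads` database, matching rows on the
#                                                   `key` columns and updating the `tag` columns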


def title():
    # Aggregate per-title stats (cost, clicks, views, usage) and upsert into t_ads_content (type=1)
    sql = """select REPLACE(REPLACE(title, CHAR(10), ''), CHAR(13), '') content,
        cast(sum(cost) as float) consume_amount,
        sum(click_count) click_times,
        sum(view_count) view_times,
        cast(count(*) as decimal(10,2)) use_times,
        group_concat(distinct book) novels,
        max(dt) end_date, min(dt) start_date
        from dw_image_cost_day
        where title != '' and title is not null
        GROUP BY REPLACE(REPLACE(title, CHAR(10), ''), CHAR(13), '')
        """
    df = db.dm.pd_data_sql(sql)
    df["data_type"] = 'all'
    df['type'] = '1'
    df['create_by'] = '0'
    key = ["content", "type"]
    # note: use_times is computed by the query but not included in tag here (cf. description())
    tag = ["view_times", "click_times", "novels", "start_date", "end_date", "create_by", 'data_type',
           'consume_amount']
    table = "t_ads_content"
    db.zx_ads.dfsave2mysql(df, table, key, tag)


def description():
    # Aggregate per-description stats and upsert into t_ads_content (type=2)
    sql = """select REPLACE(REPLACE(description, CHAR(10), ''), CHAR(13), '') content,
        sum(cost) consume_amount,
        sum(click_count) click_times,
        sum(view_count) view_times,
        cast(count(*) as decimal(10,2)) use_times,
        group_concat(distinct book) novels,
        max(dt) end_date, min(dt) start_date
        from dw_image_cost_day
        where description != '' and description is not null
        GROUP BY REPLACE(REPLACE(description, CHAR(10), ''), CHAR(13), '')
        """
    df = db.dm.pd_data_sql(sql)
    df["data_type"] = 'all'
    df['type'] = '2'
    df['create_by'] = '0'
    key = ["content", "type"]
    tag = ['use_times', "view_times", "click_times", "novels", "start_date", "end_date", "create_by", 'data_type',
           'consume_amount']
    table = "t_ads_content"
    db.zx_ads.dfsave2mysql(df, table, key, tag)


def image():
    # TODO: newly added video/image columns have not been merged into the media/idea
    #       tables, and no corresponding merge has been done in dw_image_cost_day either
    # Aggregate per-signature (media file) stats and upsert into t_ads_media
    sql = """select signature, sum(consume_amount) consume_amount,
        sum(click_times) click_times,
        sum(view_times) view_times,
        sum(use_times) use_times,
        group_concat(distinct novels) novels,
        max(end_date) end_date,
        min(start_date) start_date,
        min(content) content,
        min(type) type,
        if(locate(',',signature)>0,0,1) single_img,
        min(width) width,
        min(height) height,
        min(media_size) media_size,
        min(media_format) media_format,
        min(video_length) video_length,
        min(video_bit_rate) video_bit_rate,
        0 max_media_size,
        min(width)/if(min(height),min(height),1) aspect_ratio
        from (select replace(signature,' ,','') as signature,
                sum(cost) consume_amount,
                sum(click_count) click_times,
                sum(use_times) use_times,
                sum(view_count) view_times,
                group_concat(distinct book) novels,
                max(dt) end_date,
                min(dt) start_date,
                replace(min(preview_url),' ,','') as content,
                if(is_video=1,2,1) type,
                if(locate(',',signature)>0,0,1) single_img,
                if(min(width)>0,min(width),0) width,
                if(min(height)>0,min(height),0) height,
                min(replace(if(left(size,2)='0,',substring(size,3),size),',0','')) media_size,
                min(replace(format,' ,','')) media_format,
                if(min(video_length)>0,min(video_length),0) video_length,
                if(min(video_bit_rate)>0,min(video_bit_rate),0) video_bit_rate
            from dw_image_cost_day
            where signature is not null and signature != ''
                and length(replace(replace(signature,',',''),' ',''))>0
            GROUP BY signature, is_video) as foo
        group by signature
        """
    # df = db.dm.getData_pd(sql)
    df = db.dm.pd_data_sql(sql)
    # print(df)
    # Data transformation: derive max_media_size from the comma-separated media_size column
    for i in range(len(df['media_size'])):
        if not df['media_size'][i]:
            continue
        size_list = df['media_size'][i].split(',')
        max_size = 0
        for size_data in size_list:
            if size_data != 'None' and float(size_data) > float(max_size):
                max_size = size_data
        # write back with .loc to avoid pandas chained-assignment issues
        df.loc[i, 'max_media_size'] = max_size
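    # Illustrative example of the loop above: a media_size value of '12345,None,67890'
    # yields max_media_size '67890'; rows with an empty media_size are skipped and keep
    # the SQL default of 0.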
    df['create_by'] = '0'
    df["data_type"] = 'all'
    key = ["signature"]
    tag = ['media_size', 'media_format', 'video_length', 'video_bit_rate',
           'max_media_size', 'use_times', "view_times", "click_times",
           "novels", "start_date", "end_date", "create_by", "single_img",
           "content", 'consume_amount', 'type', 'width', 'height', 'aspect_ratio']
    table = "t_ads_media"
    db.zx_ads.dfsave2mysql(df, table, key, tag)


def adcreative():
    # Aggregate stats per (signature, title, description) creative and upsert into t_ads_idea
    sql = """select signature,title,article,
        sum(click_times) click_times,
        sum(view_times) view_times,
        sum(use_times) use_times,
        sum(consume_amount) consume_amount,
        group_concat(distinct novels) novels,
        min(start_date) start_date,
        max(end_date) end_date,
        min(media) media,
        min(channel) channel,
        min(type) type,
        if(locate(',',signature)>0,0,1) single_img,
        min(width) width,
        min(height) height,
        min(media_size) media_size,
        min(media_format) media_format,
        min(video_length) video_length,
        min(video_bit_rate) video_bit_rate,
        0 max_media_size,
        min(width)/if(min(height),min(height),1) aspect_ratio
        from (select replace(signature,' ,','') as signature, title, description article,
                sum(click_count) click_times,
                sum(view_count) view_times,
                sum(use_times) use_times,
                sum(cost) consume_amount,
                group_concat(distinct book) novels,
                min(dt) start_date, max(dt) end_date,
                min(replace(preview_url,' ,','')) media,
                if(min(width)>0,min(width),0) width,
                if(min(height)>0,min(height),0) height,
                min(replace(if(left(size,2)='0,',substring(size,3),size),',0','')) media_size,
                min(replace(format,' ,','')) media_format,
                if(min(video_length)>0,min(video_length),0) video_length,
                if(min(video_bit_rate)>0,min(video_bit_rate),0) video_bit_rate,
                type channel,
                if(is_video=1,2,1) type,
                if(locate(',',signature)>0,0,1) single_img
            from dw_image_cost_day
            where signature is not null and signature != ''
            GROUP BY signature, title, description, type, is_video) as foo
        group by signature, title, article
        """
    # df = db.dm.getData_pd(sql)
    df = db.dm.pd_data_sql(sql)
    # Data transformation: derive max_media_size from the comma-separated media_size column
    # (same logic as in image() above)
    for i in range(len(df['media_size'])):
        if not df['media_size'][i]:
            continue
        size_list = df['media_size'][i].split(',')
        max_size = 0
        for size_data in size_list:
            if size_data != 'None' and float(size_data) > float(max_size):
                max_size = size_data
        # write back with .loc to avoid pandas chained-assignment issues
        df.loc[i, 'max_media_size'] = max_size
    key = ["signature", 'title', 'article']
    tag = ['media_size', 'media_format', 'video_length', 'video_bit_rate', 'max_media_size',
           'use_times', "view_times", "click_times", "novels", "start_date", "end_date",
           "type", "channel",
           'consume_amount', 'single_img', 'media', 'width', 'height', 'aspect_ratio']
    table = "t_ads_idea"
    db.zx_ads.dfsave2mysql(df, table, key, tag)


# TODO: merge online and offline videos
#       (the same signature should always resolve to the same record)


def run():
    title()
    description()
    image()
    adcreative()


if __name__ == '__main__':
    run()
    # title()
    # description()
    # image()
    # adcreative()