Procházet zdrojové kódy

MOD:dw_image_cost_day 数据进行处理

cxyu před 3 roky
rodič
revize
bfe24a3fb3

+ 17 - 3
app/etl/MaterialLibrary/MaterialDataClean.py

@@ -52,17 +52,31 @@ def description():
 
 
 def image():
-    sql="""select signature,
+    sql="""select signature,sum(consume_amount) consume_amount,
+            sum(click_times) click_times,
+            sum(view_times) view_times,
+            group_concat(distinct novels) novels ,
+            max(end_date) end_date,
+            min(start_date) start_date,
+            min(content) content,
+            min(type) type,  
+            if(locate(',',signature)>0,0,1) single_img
+        from (select replace(signature,' ,','') as signature ,
             sum(cost) consume_amount,
             sum(click_count) click_times,
             sum(view_count) view_times,
             group_concat(distinct book) novels ,
             max(dt) end_date,
             min(dt) start_date,
-            min(preview_url) content,
+            replace (min(preview_url),' ,','') as content,
             if(is_video=1,2,1) type,  
             if(locate(',',signature)>0,0,1) single_img
-            from dw_image_cost_day  where signature is not null and signature !=''  GROUP BY  signature,is_video"""
+            from dw_image_cost_day  
+            where signature is not null and signature !=''  
+            and length (replace (replace (signature,',',''),' ',''))>0
+            GROUP BY  signature,is_video) as foo
+            group by signature  
+            """
 
     df = db.dm.getData_pd(sql)
     # print(df)

+ 17 - 8
app/etl/dw/dw_image_cost_day.py

@@ -34,7 +34,8 @@ def run(dt):
     for i in data:
         # print(i)
         li.extend(i[-1].split(','))
-    #TODO:之后如果一天产生的图片过多,可能超过sql的字符限制
+    #TODO:之后如果一天产生的图片过多,可能超过sql的字符限制,
+    # 之后数据使用hive,来进行数据存储
 
     sql3 = f"select image_id,preview_url,signature,width,height from image_info where  image_id in ({str(set(li))[1:-1]})"
 
@@ -46,23 +47,31 @@ def run(dt):
     # print(image_di)
 
     for i in data:
-        y = ''
-        z = ''
+        preview_url = ''
+        signature = ''
         width = ''
         height = ''
+        image_id = ''
         for j in i[-1].split(','):
             if image_di.get(j):
-                y = y + ',' + image_di.get(j)[0]
-                z = z + ',' + image_di.get(j)[1]
+                image_id = image_id + ',' + j
+                preview_url = preview_url + ',' + image_di.get(j)[0]
+                signature = signature + ',' + image_di.get(j)[1]
                 width = width + ',' + str(image_di.get(j)[2])
                 height = height + ',' + str(image_di.get(j)[3])
-        i.append(y[1:])
-        i.append(z[1:])
+            else:
+                image_id = image_id + ',' +j
+                preview_url = preview_url + ',' + ' '
+                signature = signature + ',' + ' '
+                width = width + ',' + '0'
+                height = height + ',' + '0'
+        i[-1]=image_id[1:]
+        i.append(preview_url[1:])
+        i.append(signature[1:])
         i.append(0)
         i.append(width[1:])
         i.append(height[1:])
 
-    # print(data)
     # exit(0)
     sql_video = f"""SELECT a.dt,b.type,sum(a.cost),sum(view_count),sum(click_count),sum(follow_count),sum(order_count),sum(order_amount),
             title,description,book,platform,stage,e.channel,pitcher,ifnull(image_id,''),g.preview_url,g.signature,1,