|
@@ -16,7 +16,11 @@ import org.apache.commons.lang3.StringUtils;
|
|
|
import org.apache.flink.api.common.eventtime.*;
|
|
|
import org.apache.flink.configuration.Configuration;
|
|
|
import org.apache.flink.connector.kafka.source.KafkaSource;
|
|
|
+import org.apache.flink.contrib.streaming.state.ConfigurableRocksDBOptionsFactory;
|
|
|
+import org.apache.flink.contrib.streaming.state.DefaultConfigurableOptionsFactory;
|
|
|
import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend;
|
|
|
+import org.apache.flink.contrib.streaming.state.PredefinedOptions;
|
|
|
+import org.apache.flink.runtime.state.StateBackend;
|
|
|
import org.apache.flink.streaming.api.CheckpointingMode;
|
|
|
import org.apache.flink.streaming.api.datastream.DataStream;
|
|
|
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
|
@@ -40,6 +44,7 @@ public class AdHourStreamJob {
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
|
|
+ env.setParallelism(6);
|
|
|
|
|
|
// 加载配置文件到 flink的全局配置中
|
|
|
Properties props = new Properties();
|
|
@@ -50,7 +55,6 @@ public class AdHourStreamJob {
|
|
|
configuration.setString(key.trim(), StringUtils.isBlank(value) ? "" : value.trim());
|
|
|
});
|
|
|
env.getConfig().setGlobalJobParameters(configuration);
|
|
|
- int parallelismKafka = Integer.parseInt(props.getProperty(ApplicationProperties.FLINK_PARALLELISM_KAFKA));
|
|
|
|
|
|
// checkpoint配置
|
|
|
env.enableCheckpointing(5 * 60 * 1000L, CheckpointingMode.EXACTLY_ONCE);
|
|
@@ -67,13 +71,16 @@ public class AdHourStreamJob {
|
|
|
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
|
|
|
// Number of consecutive checkpoint failures tolerated before the job is failed over; 0 (the default) tolerates none
|
|
|
env.getCheckpointConfig().setTolerableCheckpointFailureNumber(0);
|
|
|
- env.setStateBackend(new EmbeddedRocksDBStateBackend(true));
|
|
|
+
|
|
|
+ EmbeddedRocksDBStateBackend stateBackend = new EmbeddedRocksDBStateBackend(true);
|
|
|
+ stateBackend.setPredefinedOptions(PredefinedOptions.SPINNING_DISK_OPTIMIZED_HIGH_MEM);
|
|
|
+ env.setStateBackend(stateBackend);
|
|
|
if (StringUtils.isNotBlank(props.getProperty(ApplicationProperties.FLINK_CHECKPOINT_SAVEPOINT))) {
|
|
|
env.getCheckpointConfig().setCheckpointStorage(props.getProperty(ApplicationProperties.FLINK_CHECKPOINT_SAVEPOINT));
|
|
|
}
|
|
|
|
|
|
KafkaSource<String> adStreamOfMinuteSource = KafkaComponent.buildKafkaSource(props, KafkaComponent.KafkaTopic.adHourTopic, KafkaComponent.KafkaTopic.KafkaGroupId.adHourConsumerGroup);
|
|
|
- DataStreamSource<String> adStreamOfMinuteIn = env.fromSource(adStreamOfMinuteSource, WatermarkStrategy.noWatermarks(), "adHourSource_kafka").setParallelism(parallelismKafka);
|
|
|
+ DataStreamSource<String> adStreamOfMinuteIn = env.fromSource(adStreamOfMinuteSource, WatermarkStrategy.noWatermarks(), "adHourSource_kafka").setParallelism(12);
|
|
|
|
|
|
// 广告分钟数据(前 5分钟的广告消耗数据)
|
|
|
final OutputTag<AdDataOfMinuteODS> adMinuteStreamTag = new OutputTag<AdDataOfMinuteODS>("adMinuteStream") {
|
|
@@ -84,13 +91,13 @@ public class AdHourStreamJob {
|
|
|
|
|
|
// 对流进行映射,拆分(实时的分钟流和回滚的小时流)
|
|
|
SingleOutputStreamOperator<AdDataOfMinuteODS> adODSStream = adStreamOfMinuteIn
|
|
|
- .filter(StringUtils::isNotBlank).setParallelism(parallelismKafka)
|
|
|
- .process(new AdHourDTOStreamProcess(adMinuteStreamTag, adHourStreamTag)).setParallelism(parallelismKafka);
|
|
|
+ .filter(StringUtils::isNotBlank).setParallelism(12)
|
|
|
+ .process(new AdHourDTOStreamProcess(adMinuteStreamTag, adHourStreamTag)).setParallelism(12);
|
|
|
|
|
|
// 分钟流
|
|
|
DataStream<AdDataOfMinuteODS> adMinuteODSStream = adODSStream.getSideOutput(adMinuteStreamTag);
|
|
|
// 分钟流-写入原始表
|
|
|
- new KeyedBatchStream<>("adMinuteODSStream", adMinuteODSStream.keyBy(AdDataOfMinuteODS::getStatDay), 6000L, 2 * 60 * 1000L)
|
|
|
+ new KeyedBatchStream<>(adMinuteODSStream.keyBy(AdDataOfMinuteODS::getStatDay), 6000L, Time.minutes(3L))
|
|
|
.toBatch()
|
|
|
.setParallelism(12)
|
|
|
.addSink(new TunnelBatchStreamSink<>(AdDataOfMinuteODS.class))
|
|
@@ -107,12 +114,12 @@ public class AdHourStreamJob {
|
|
|
.window(TumblingEventTimeWindows.of(Time.minutes(5L)))
|
|
|
.trigger(new AdMinuteODSStreamTrigger())
|
|
|
.process(new AdMinuteDWDProcess())
|
|
|
- .setParallelism(parallelismKafka);
|
|
|
- new KeyedBatchStream<>("adMinuteDWDStream", adMinuteDWDStream.keyBy(AdStatOfMinuteDWD::getStatDay), 3000L, 60 * 1000L)
|
|
|
+ .setParallelism(12);
|
|
|
+ new KeyedBatchStream<>(adMinuteDWDStream.keyBy(AdStatOfMinuteDWD::getStatDay), 5000L, Time.minutes(3L))
|
|
|
.toBatch()
|
|
|
- .setParallelism(6)
|
|
|
+ .setParallelism(12)
|
|
|
.addSink(new TunnelBatchStreamSink<>(AdStatOfMinuteDWD.class))
|
|
|
- .setParallelism(6)
|
|
|
+ .setParallelism(12)
|
|
|
.name("sink_ad_minute_dwd");
|
|
|
|
|
|
// Minute stream - write to ClickHouse
|
|
@@ -120,7 +127,7 @@ public class AdHourStreamJob {
|
|
|
adMinuteDWDStream
|
|
|
.keyBy(AdStatOfMinuteDWD::getAdId)
|
|
|
.process(new CostMinuteProcess());
|
|
|
- new BatchStream<>("adMinuteDMStream", clickhouseMinuteDmStream, 3000L, 60 * 1000L)
|
|
|
+ new BatchStream<>(clickhouseMinuteDmStream, 3000L, Time.minutes(1L))
|
|
|
.toBatch()
|
|
|
.addSink(new AdMinuteDMToCkBatchSink())
|
|
|
.name("sink_ad_minute_dm_clickhouse");
|
|
@@ -128,11 +135,11 @@ public class AdHourStreamJob {
|
|
|
// 小时流
|
|
|
DataStream<AdDataOfHourODS> adHourODSStream = adODSStream.getSideOutput(adHourStreamTag);
|
|
|
// 小时流-写入原始表
|
|
|
- new KeyedBatchStream<>("adHourODSStream", adHourODSStream.keyBy(AdDataOfHourODS::getStatDay), 3000L, 3 * 60 * 1000L)
|
|
|
+ new KeyedBatchStream<>(adHourODSStream.keyBy(AdDataOfHourODS::getStatDay), 3000L, Time.minutes(3L))
|
|
|
.toBatch()
|
|
|
- .setParallelism(6)
|
|
|
+ .setParallelism(12)
|
|
|
.addSink(new TunnelBatchStreamSink<>(AdDataOfHourODS.class))
|
|
|
- .setParallelism(6)
|
|
|
+ .setParallelism(12)
|
|
|
.name("sink_ad_hour_ods");
|
|
|
|
|
|
// 小时流-计算
|
|
@@ -141,11 +148,11 @@ public class AdHourStreamJob {
|
|
|
.process(new AdHourDWDProcess());
|
|
|
|
|
|
// 小时流-写入maxCompute
|
|
|
- new KeyedBatchStream<>("adHourDWDStream", adHourDWDStream.keyBy(AdStatOfHourDWD::getStatDay), 3000L, 3 * 60 * 1000L)
|
|
|
+ new KeyedBatchStream<>(adHourDWDStream.keyBy(AdStatOfHourDWD::getStatDay), 3000L, Time.minutes(3L))
|
|
|
.toBatch()
|
|
|
- .setParallelism(6)
|
|
|
+ .setParallelism(12)
|
|
|
.addSink(new TunnelBatchStreamSink<>(AdStatOfHourDWD.class))
|
|
|
- .setParallelism(6)
|
|
|
+ .setParallelism(12)
|
|
|
.name("sink_ad_hour_dwd");
|
|
|
|
|
|
// 分钟流转小时流同时填充空白的小时
|
|
@@ -159,7 +166,7 @@ public class AdHourStreamJob {
|
|
|
adHourDWDAllStream
|
|
|
.keyBy(AdStatOfHourDWD::getAdId)
|
|
|
.process(new CostHourProcess());
|
|
|
- new BatchStream<>("adHourDMStream", adHourDMStream, 2000L, 60 * 1000L)
|
|
|
+ new BatchStream<>(adHourDMStream, 3000L, Time.minutes(1L))
|
|
|
.toBatch()
|
|
|
.addSink(new AdHourDMToCkBatchSink())
|
|
|
.name("sink_ad_hour_dm_clickhouse");
|
|
@@ -169,9 +176,11 @@ public class AdHourStreamJob {
|
|
|
.keyBy(AdStatOfDayDWD::getAdId)
|
|
|
.process(new AdDayOnTimeStreamCompletionProcess());
|
|
|
// 写入 ck
|
|
|
- new BatchStream<>("adDayDWDToCkStream", dayStreamFromHour, 500L, 60 * 1000L)
|
|
|
+ new BatchStream<>(dayStreamFromHour, 3000L, Time.minutes(1L))
|
|
|
.toBatch()
|
|
|
+ .setParallelism(1)
|
|
|
.addSink(new AdDayDWDToDBBatchSink())
|
|
|
+ .setParallelism(1)
|
|
|
.name("ad_day_dwd_from_hour_sink");
|
|
|
|
|
|
env.execute("ad_hour_stream_job");
|