1.checkpoint
/**
 * Checkpointing demo: a synthetic order source feeds a keyed 3-second window whose
 * window function keeps a running total that is saved to and restored from checkpoints.
 */
object checkpointMain {

  /**
   * Builds and runs the job.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Create the streaming execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Run every operator with a single parallel instance.
    env.setParallelism(1)
    // Trigger a checkpoint every 1000 ms.
    env.enableCheckpointing(1000)
    // Checkpoint storage path (disabled in the original; its TODO notes that the Flink
    // version in use was not yet integrated with Hadoop):
    // env.setStateBackend(new FsStateBackend("hdfs://192.168.1.150:9000/flink-checkpoint"))
    // Exactly-once checkpointing mode.
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    // Allow at most one checkpoint to be in progress at a time.
    env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)

    val orderDataStream = env.addSource(new RichSourceFunction[(Int, String, Double)] {
      // Written by cancel() and read by run(); @volatile makes the update visible
      // to the thread executing run().
      @volatile var runing = true

      // Emits (sequence number, "zhangsan", random amount in [0, 999]) once per second,
      // restarting the sequence at 0 after 10000 records.
      override def run(sourceContext: SourceFunction.SourceContext[(Int, String, Double)]): Unit = {
        while (runing) {
          var i = 0
          // Re-check the flag on every record so cancel() takes effect promptly
          // instead of only after the full 10000-record pass.
          while (runing && i < 10000) {
            sourceContext.collect((i, "zhangsan", RandomStringUtils.randomNumeric(3).toDouble))
            TimeUnit.SECONDS.sleep(1)
            i += 1
          }
        }
      }

      override def cancel(): Unit = {
        runing = false
      }
    })

    orderDataStream
      .keyBy(_._2)
      .timeWindow(Time.seconds(3))
      .apply(new myWindowFunction)
      .print()

    // Execute the job.
    env.execute()
  }

  /**
   * Window function that adds every order amount it sees to a running total and emits
   * that total once per fired window. The total is checkpointed through `ListCheckpointed`.
   */
  class myWindowFunction
      extends WindowFunction[(Int, String, Double), Double, String, TimeWindow]
      with ListCheckpointed[UDFState] {

    /** Running sum of all amounts processed by this function instance. */
    var totalMoney = 0.0

    /**
     * Adds the amounts of the window's elements to the running total and emits it.
     *
     * @param key    key of the window (second tuple field)
     * @param window the window being evaluated
     * @param input  elements assigned to the window
     * @param out    collector receiving the updated total
     */
    override def apply(key: String, window: TimeWindow, input: Iterable[(Int, String, Double)], out: Collector[Double]): Unit = {
      // Fold left-to-right starting from the current total (same summation order as a plain loop).
      totalMoney = input.foldLeft(totalMoney)((acc, elem) => acc + elem._3)
      out.collect(totalMoney)
    }

    /**
     * Returns the state to store in the checkpoint: a single entry holding the running total.
     * The two Long parameters are supplied by the `ListCheckpointed` interface and are unused.
     */
    override def snapshotState(l: Long, l1: Long): util.List[UDFState] = {
      val states = new util.ArrayList[UDFState]()
      // The original returned the list empty, so nothing was ever checkpointed.
      states.add(UDFState(totalMoney))
      states
    }

    /**
     * Restores the running total from checkpointed state. The list may be empty or hold
     * several entries, so the entries are summed rather than reading index 0.
     */
    override def restoreState(list: util.List[UDFState]): Unit = {
      var restored = 0.0
      val it = list.iterator()
      while (it.hasNext) {
        restored += it.next().totalMoney
      }
      // Assign the field itself; the original wrote to a shadowing local and restored nothing.
      totalMoney = restored
    }
  }

  /** Checkpointed state of [[myWindowFunction]]: the running total. */
  case class UDFState(var totalMoney: Double)
}
2.窗口函数
EventTime 事件(数据)本身产生的时间,通常由数据中携带的时间戳字段给出
IngestionTime 数据进入 Flink source 的时间
ProcessingTime Flink 节点执行某个 operator 时的本地系统时间
窗口可以分为两大类:CountWindow(按指定条数生成一个 window)和 TimeWindow(按设置好的时间生成一个 window)
TimeWindow可以分为:滚动窗口,滑动窗口和会话窗口
滚动窗口:
滚动窗口分配器将每个元素分配到一个指定窗口大小的窗口中,滚动窗口有一个固定的大小,并且不会出现重叠。例如:如果你指定了一个5分钟大小的滚动窗口,窗口的创建如下图所示:
滑动窗口
滑动窗口分配器将元素分配到固定长度的窗口中,与滚动窗口类似,窗口的大小由窗口大小参数来配置,另一个窗口滑动参数控制滑动窗口开始的频率。因此,滑动窗口如果滑动参数小于窗口大小的话,窗口是可以重叠的,在这种情况下元素会被分配到多个窗口中。
会话窗口: 没有固定的窗口大小;当超过指定的会话间隔(gap)没有新数据到达时,当前会话结束,窗口随之关闭
// Sliding-window word count over a socket text stream.

// Create the streaming execution environment.
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Run every operator with a single parallel instance.
// (The result of setParallelism is not a useful value, so it is no longer bound to a variable.)
env.setParallelism(1)

env.socketTextStream("localhost", 9999)
  .flatMap(x => x.split(","))                     // split each line into comma-separated words
  .map((_, 1))                                    // pair every word with a count of 1
  .keyBy(_._1)                                    // group by word
  .timeWindow(Time.seconds(10), Time.seconds(5))  // 10-second windows sliding every 5 seconds
  .sum(1)                                         // sum the counts (tuple field 1) per word
  .print()

// Execute the job.
env.execute()
注意:Windows 下执行 nc -L -p 9999 来开启 9999 端口的监听
当滑动间隔(slide)大于窗口大小(size)时,部分数据不会落入任何窗口,数据就会丢失
当滑动间隔(slide)小于窗口大小(size)时,窗口相互重叠,数据就会被重复计算
3.apply方法
实现一个windowFunction
指定该类型泛型[输入数据类型, 输出数据类型, keyBy中使用分组字段的类型, 窗口类型]
// Word count over 5-second tumbling windows using a custom WindowFunction.

// Create the streaming execution environment.
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Run every operator with a single parallel instance.
env.setParallelism(1)

env.socketTextStream("localhost", 9999)
  .flatMap(_.split(","))        // split each line into comma-separated words
  .map((_, 1))                  // pair every word with a count of 1
  .keyBy(_._1)                  // keyBy partitions the stream by word (comparable to GROUP BY)
  .timeWindow(Time.seconds(5))  // evaluate one window every 5 seconds
  .apply(new WindowFunction[(String, Int), (String, Int), String, TimeWindow] {
    // Type parameters: [input type, output type, key type used in keyBy, window type].
    override def apply(key: String, window: TimeWindow, input: Iterable[(String, Int)], out: Collector[(String, Int)]): Unit = {
      // Every element of a keyed window carries the window key as its word, so emitting
      // (key, sum of counts) equals reducing the tuples, without a partial `reduce`.
      out.collect((key, input.map(_._2).sum))
    }
  })
  .print()

// Execute the job.
env.execute()
// NOTE(review): this brace presumably closes an enclosing method whose header is not part of this excerpt.
}
4.Watermark 案例
/**
 * Watermark example: reads orders from a custom source, assigns event-time timestamps
 * and periodic watermarks, sums the order amounts per user over 5-second windows and
 * writes the results to a text file.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
// Create the streaming execution environment.
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Run every operator with a single parallel instance.
env.setParallelism(1)
// Use event time as the stream time characteristic.
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
// Create the custom data source.
var orderDataStream =env.addSource(new myRichSourceFunction )
orderDataStream.print() // debug output: print every generated order
// Assign timestamps and watermarks.
val orderDS: DataStream[Order] = orderDataStream.assignTimestampsAndWatermarks(new MyorderDataStream)
// Per user and 5-second window, fold the orders into one Order that carries the later
// order's id and user id, the summed amount, and a timestamp of 0.
orderDS.keyBy(_.userId)
.timeWindow(Time.seconds(5))
.reduce(
(r1,r2)=> Order(r2.oderID,r2.userId,r2.monery+r1.monery,0))
.writeAsText("./output/2.txt")
// Execute the job.
env.execute()
// NOTE(review): the closing brace of main is not visible in this excerpt.
/**
 * An order record flowing through the watermark example.
 *
 * @param oderID    order identifier
 * @param userId    identifier of the user who placed the order
 * @param monery    order amount
 * @param timestamp time of the order in epoch milliseconds
 */
case class Order(
  oderID: String,
  userId: Int,
  monery: Long,
  timestamp: Long
)
/**
 * Source that keeps emitting randomly generated [[Order]]s until it is cancelled.
 */
class myRichSourceFunction extends RichSourceFunction[Order] {
  // Written by cancel() and read by run(); per the Flink SourceFunction contract these
  // are called from different threads, so the flag must be @volatile to be seen by run().
  @volatile var running = true

  /**
   * Emits orders in a loop while `running` is true. Each order has:
   *  - a random UUID as order id,
   *  - a random user id in [0, 2],
   *  - a random amount in [0, 100],
   *  - the current system time as its timestamp.
   *
   * @param sourceContext context used to emit the generated orders
   */
  override def run(sourceContext: SourceFunction.SourceContext[Order]): Unit = {
    while (running) {
      val order = Order(
        UUID.randomUUID().toString,
        Random.nextInt(3),
        Random.nextInt(101),
        new Date().getTime
      )
      sourceContext.collect(order)
      // The one-second throttle is disabled, so orders are emitted as fast as possible;
      // re-enable the next line to emit one order per second.
      // TimeUnit.SECONDS.sleep(1)
    }
  }

  /** Requests termination of the emit loop in `run`. */
  override def cancel(): Unit = {
    running = false
  }
}
/**
 * Periodic watermark assigner for [[Order]] streams. The watermark trails the largest
 * event timestamp seen so far by `delayTime` milliseconds.
 */
class MyorderDataStream extends AssignerWithPeriodicWatermarks[Order] {
  /** Largest event timestamp observed so far (epoch milliseconds). */
  var currentTimestamp = 0L

  /** How far the watermark lags behind `currentTimestamp`, in milliseconds. */
  val delayTime = 2000

  /**
   * Builds the current watermark (`currentTimestamp - delayTime`) and prints the
   * watermark time, the latest event time and the system time for inspection.
   *
   * @return the current watermark
   */
  override def getCurrentWatermark: Watermark = {
    val watermark = new Watermark(currentTimestamp - delayTime)
    val dateFormat: FastDateFormat = FastDateFormat.getInstance("HH:mm:ss")
    // The first label originally read "system time" although the value is the watermark.
    println(
      s"当前的水印时间:${dateFormat.format(watermark.getTimestamp)}," +
        s"当前的事件时间:${dateFormat.format(currentTimestamp)}, " +
        s"当前系统时间:${dateFormat.format(System.currentTimeMillis())}"
    )
    watermark
  }

  /**
   * Returns the event time of an order and advances the maximum timestamp seen so far.
   *
   * @param t the order whose timestamp is extracted
   * @param l timestamp previously attached to the element (unused)
   * @return the order's own timestamp
   */
  override def extractTimestamp(t: Order, l: Long): Long = {
    val timestamp: Long = t.timestamp
    currentTimestamp = Math.max(currentTimestamp, timestamp)
    // Return the element's own timestamp, not the running maximum; otherwise an
    // out-of-order record would be stamped with a later record's time.
    timestamp
  }
}