Flink(五) 【消费kafka】
阅读原文时间:2023年07月08日阅读:1

目录

0.目的

测试flink消费kafka的几种消费策略

kafkaSource.setStartFromEarliest() //从起始位置
kafkaSource.setStartFromLatest() //从最新位置
kafkaSource.setStartFromTimestamp(起始时间戳) //从指定时间开始消费,参数为 Long 类型的毫秒时间戳(不是字符串)
kafkaSource.setStartFromGroupOffsets() //默认
kafkaSource.setStartFromSpecificOffsets() //指定offset

1.本地测试

package flink_01_connector.source

import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import scala.collection.JavaConverters._

/**
 * Local smoke test for the Flink Kafka connector: consumes a single topic
 * starting 24 hours before "now" and prints every record to stdout.
 *
 * @author: HaoWu
 * @create: 2020-12-16
 */
object KafkaConnectorTest {
  def main(args: Array[String]): Unit = {
    // Step 0: streaming environment, forced to a single parallel instance.
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)

    // Step 1: Kafka consumer configuration.
    val consumerProps = new Properties()
    consumerProps.put("bootstrap.servers", "kafka地址")
    consumerProps.put("group.id", "test5")
    // Key and value deserializer class names.
    consumerProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    consumerProps.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    // Optional consumer settings, intentionally left disabled:
    //   enable.auto.commit      = true    (commit offsets automatically)
    //   auto.commit.interval.ms = 1500    (commit interval)
    //   auto.offset.reset       = latest  (start from the newest offset)

    val topicNames = List("xes_test_anwser_detail").asJava
    val consumer = new FlinkKafkaConsumer011[String](topicNames, new SimpleStringSchema(), consumerProps)

    // Begin reading at the records timestamped one day before the current time.
    val oneDayMillis = 24 * 3600 * 1000
    consumer.setStartFromTimestamp(System.currentTimeMillis() - oneDayMillis)

    // Step 2: attach the Kafka source to the job graph.
    val records: DataStream[String] = streamEnv.addSource(consumer)

    // Step 3: print each record.
    records.print()

    // Step 4: run the job.
    streamEnv.execute()
  }
}

2.线上测试

package flink_01_connector.source

import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import scala.collection.JavaConverters._

/**
 * Reads one or more Kafka topics and prints every record, starting from a
 * point a configurable number of hours in the past.
 *
 * Required command-line arguments (parsed with ParameterTool):
 *   --topic             topic name(s); several may be given, comma-separated
 *   --bootstrap_server  Kafka bootstrap servers
 *   --group_id          consumer group id
 *   --hours             how many hours back from now to start consuming
 *
 * @author: HaoWu
 * @create: 2020-12-16
 */
object KafkaConnectorOnlineTest {
  def main(args: Array[String]): Unit = {
    // 0 Set up the environment and read the job arguments.
    val parameterTool: ParameterTool = ParameterTool.fromArgs(args)
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // getRequired/getInt fail immediately with the name of the missing key,
    // instead of a NullPointerException surfacing later in the job setup.
    val topic: String = parameterTool.getRequired("topic") // may list several topics, comma-separated
    val bootStrapServer: String = parameterTool.getRequired("bootstrap_server") // Kafka cluster url
    val groupId: String = parameterTool.getRequired("group_id") // consumer group
    val hours: Int = parameterTool.getInt("hours") // start consuming from this many hours ago

    //    val keyTabPath = parameterTool.get("keytab_path") // security / authentication
    //    env.registerCachedFile(keyTabPath, "keytab")
    // Expose the parsed arguments as global job parameters.
    env.getConfig.setGlobalJobParameters(parameterTool)

    // 1 Build the Kafka source.
    // Trim each entry and drop empty ones so "a, b" or a trailing comma does
    // not produce a topic name with stray whitespace or an empty topic.
    val topicNames = topic.split(",").map(_.trim).filter(_.nonEmpty).toList
    require(topicNames.nonEmpty, s"--topic must name at least one topic, got: '$topic'")
    val topics = topicNames.asJava
    val props = new Properties()
    props.put("bootstrap.servers", bootStrapServer)
    props.put("group.id", groupId)
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") // key deserializer
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") // value deserializer
    val kafkaSource = new FlinkKafkaConsumer011[String](topics, new SimpleStringSchema(), props)

    // Compute the offset in Long arithmetic: as Int, hours * 3600 * 1000
    // overflows once hours >= 597 and yields a wrong (even future) timestamp.
    val lag: Long = System.currentTimeMillis() - hours.toLong * 3600L * 1000L
    kafkaSource.setStartFromTimestamp(lag) // start consuming from `hours` hours ago
    //    kafkaSource.setStartFromEarliest() // start consuming from the earliest offset

    // 2 Create the stream.
    val kafkaStream: DataStream[String] = env.addSource(kafkaSource)

    // 3 Print each record with a fixed prefix.
    kafkaStream.print("| log |")

    // 4 Run the job.
    env.execute()
  }
}

提交作业

#!/bin/bash
# Submits the KafkaConnectorOnlineTest Flink job to YARN in the background and
# redirects the client's stdout/stderr to ../logs/<jobName>_<timestamp>.log.

source ~/.bashrc

# Work from the script's own directory so the relative ../logs path is stable.
cd "$(dirname "$0")" || exit 1
day=$(date +%Y%m%d%H%M)

#flink
jobName=KafkaConnectorOnlineTest_wuhao
clazz=flink_01_connector.source.KafkaConnectorOnlineTest
jar_path=/home//wuhao/flink-learning/jar/02_flink_learning-1.0-SNAPSHOT-jar-with-dependencies.jar
parallelism=2
sourceParallelism=4

#kafka  bootstrap_server
# Placeholder value: replace with the real broker list (e.g. host1:9092,host2:9092).
# It must be quoted; unquoted, `bootstrap_server=kafka url` runs a command named
# `url` instead of assigning the variable.
bootstrap_server="kafka url"
topic=xes_test_anwser_detail
group_id=KafkaConnectorOnlineTest_wuhao
hours=24

# These two flags are passed to the job below but had no definition in this
# script. The defaults are placeholders; override them from the environment.
# NOTE(review): confirm the values the job expects.
isSecurity=${isSecurity:-false}
consumerStrategy=${consumerStrategy:-timestamp}

#kudu
kudu_instance=1v6_common_edc_online_answer
# Placeholder host; quoted so the asterisks are never glob-expanded.
kudu_host='****:7051'
kudu_flush_num=5

# Make sure the log directory exists before redirecting into it.
mkdir -p ../logs

#-----------------------run----------------------------------------------
/software/servers/flink1.9.1_wx_dp_hive/bin/flink run -m yarn-cluster \
-ynm "${jobName}" \
-yqu root.wangxiao.dp \
-c "${clazz}" "${jar_path}" \
--jobName "${jobName}" \
--keytab_path /home/wx_dp_hive/wx_dp_hive.keytab \
--bootstrap_server "${bootstrap_server}" \
--topic "${topic}" \
--group_id "${group_id}" \
--isSecurity "${isSecurity}" \
--consumerStrategy "${consumerStrategy}" \
--hours "${hours}" \
--parallelism "${parallelism}" \
--sourceParallelism "${sourceParallelism}" \
--kudu_instance "${kudu_instance}" \
--kudu_host "${kudu_host}" \
--kudu_flush_num "${kudu_flush_num}" >"../logs/${jobName}_${day}.log" 2>&1 &

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章